In [65]:
%matplotlib inline
import numpy as np
import pandas as pd
import csv
from sklearn.svm import SVC
from itertools import product
from sklearn.metrics import accuracy_score, confusion_matrix

with open('benchmarkdataset_train.fasta', 'r') as file:
    lines = file.readlines()

# 初始化兩個空的列表，分別用於存儲標題（Header）和序列（Sequence）
headers = []
sequences = []

# 遍歷 FASTA 文件的每一行
for line in lines:
    line = line.strip()  # 去掉行尾的空白字符

    # 如果行以 '>AA' 開頭，則視為標題
    if line.startswith('>AA') or line.startswith('>neg'):
        headers.append(line)
        sequences.append('')
    else:
        sequences[-1] += line

# 將標題和序列轉換為 DataFrame
data = {'Header': headers, 'Sequence': sequences}
bench_train = pd.DataFrame(data)

print(bench_train)

print()
# 計算每種胺基酸的出現次數
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
count_matrix_train = np.zeros((len(sequences), len(amino_acids)))
percentage_matrix_train = []

for i, sequence in enumerate(sequences):
    for j, amino_acid in enumerate(amino_acids):
        count_matrix_train[i, j] = sequence.count(amino_acid)
    
    percentage_matrix_train.append(count_matrix_train[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_matrix_train = np.array(percentage_matrix_train)
print('percentage_train in 1-D:')
print(percentage_matrix_train)

print()
# 顯示比例矩陣
df_percentage_train = pd.DataFrame(percentage_matrix_train, columns=list(amino_acids))
print(df_percentage_train)

# 合併特徵資料和目標資料
df_combined_train = pd.concat([df_percentage_train, bench_train['Header']], axis=1)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_train['Target'] = df_combined_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_train, y_train = df_combined_train.drop(['Header', 'Target'], axis=1), df_combined_train['Target']

#print(X_train)
#print(y_train)

#--------------------------------------------------------------
# 創建所有二個胺基酸相連的標籤
DPC = [a + b for a in amino_acids for b in amino_acids]

# 計算每對胺基酸的出現次數
count_DPC_train = np.zeros((len(sequences), len(DPC)))
percentage_DPC_train = []

for i, sequence in enumerate(sequences):
    for j, aa_pair in enumerate(DPC):
        count_DPC_train[i, j] = sequence.count(aa_pair)
        
    percentage_DPC_train.append(count_DPC_train[i]/len(sequence))

# 計算每對胺基酸在序列中的比例
percentage_DPC_train = np.array(percentage_DPC_train)
print('percentage_DPC_train in 1-D:')
print(percentage_DPC_train)

# 顯示比例矩陣
df_percentage_DPC_train = pd.DataFrame(percentage_DPC_train, columns=DPC)
print(df_percentage_DPC_train)

# 合併特徵資料和目標資料
df_combined_DPC_train = pd.concat([df_percentage_train, df_percentage_DPC_train, bench_train['Header']], axis=1)
#df_combined_DPC_train = pd.concat([df_percentage_DPC_train, bench_train['Header']], axis=1)
print(df_combined_DPC_train)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_DPC_train['Target'] = df_combined_DPC_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_DPC_train, y_DPC_train = df_combined_DPC_train.drop(['Header', 'Target'], axis=1), df_combined_DPC_train['Target']
#print(X_DPC_train)

#--------------------------------------------------------------
# 產生所有三種胺基酸的排列組合
TPC = [''.join(comb) for comb in product(amino_acids, repeat=3)]

# 計算每種胺基酸的出現次數
count_TPC_train = np.zeros((len(sequences), len(TPC)))
percentage_TPC_train = []

for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_train[i, j] = sequence.count(amino_acid_combination)
        
    percentage_TPC_train.append(count_TPC_train[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_train = np.array(percentage_TPC_train)
print('percentage_TPC_train in 1-D:')
print(percentage_TPC_train)

# 顯示比例矩陣
df_percentage_TPC_train = pd.DataFrame(percentage_TPC_train, columns=TPC)
print(df_percentage_TPC_train)

# 合併特徵資料和目標資料
df_combined_TPC_train = pd.concat([df_percentage_train, df_percentage_DPC_train, df_percentage_TPC_train, bench_train['Header']], axis=1)
print(df_combined_TPC_train)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_train['Target'] = df_combined_TPC_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_train, y_TPC_train = df_combined_TPC_train.drop(['Header', 'Target'], axis=1), df_combined_TPC_train['Target']


#--------------------------------------------------------------
# 將DPC抓到比較好的2個胺基酸加進DPC的feature

# 初始化 CN 和 CS 的特徵矩陣
combinations_CN = ['CN' + ''.join(comb) for comb in product(amino_acids, repeat=1)] + [''.join(comb) + 'CN' for comb in product(amino_acids, repeat=1)]
combinations_CS = ['CS' + ''.join(comb) for comb in product(amino_acids, repeat=1)] + [''.join(comb) + 'CS' for comb in product(amino_acids, repeat=1)]

all_combinations = combinations_CN + combinations_CS

count_matrix_total = np.zeros((len(sequences), len(all_combinations)))
percentage_matrix_total = []

for i, sequence in enumerate(sequences):
    for j, combo in enumerate(all_combinations):
        count_matrix_total[i, j] = sequence.count(combo)
    
    percentage_matrix_total.append(count_matrix_total[i] / len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_matrix_total = np.array(percentage_matrix_total)
print('percentage_combination_train in 1-D:')
print(percentage_matrix_total)

df_percentage_total = pd.DataFrame(percentage_matrix_total, columns=all_combinations)
print(df_percentage_total)

# 將 CN、CS 開頭的特徵和 DPC 特徵合併
df_combined_total = pd.concat([df_percentage_total, df_percentage_DPC_train, bench_train['Header']], axis=1)
print(df_combined_total)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_total['Target'] = df_combined_total['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_combination_train, y_combination_train = df_combined_total.drop(['Header', 'Target'], axis=1), df_combined_total['Target']


# --------------------------------------------------------------
# 產生所有TPC穿插一個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_4_train = np.zeros((len(sequences), len(TPC)))
percentage_TPC_4_train = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_4_train[i, j] = 0
        for k in amino_acid :
            # 看起來很愚蠢但這是最不吃效能的方法
            count_TPC_4_train[i, j] += sequence.count( k + amino_acid_combination )
            count_TPC_4_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] )
            count_TPC_4_train[i, j] += sequence.count( amino_acid_combination[:1] + k + amino_acid_combination[2] )
            count_TPC_4_train[i, j] += sequence.count( amino_acid_combination + k )
    percentage_TPC_4_train.append(count_TPC_4_train[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_4_train = np.array(percentage_TPC_4_train)

# 顯示比例矩陣
df_percentage_TPC_4_train = pd.DataFrame(percentage_TPC_4_train, columns=TPC)
print('TPC_4:')
print(df_percentage_TPC_4_train)

# 合併特徵資料和目標資料
df_combined_TPC_4_train = pd.concat([df_percentage_train, df_percentage_DPC_train, df_percentage_TPC_4_train, bench_train['Header']], axis=1)
print('AAC + DPC + TPC_4:')
print(df_combined_TPC_4_train)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_4_train['Target'] = df_combined_TPC_4_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_4_train, y_TPC_4_train = df_combined_TPC_4_train.drop(['Header', 'Target'], axis=1), df_combined_TPC_4_train['Target']



# --------------------------------------------------------------
# 產生所有TPC穿插兩個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_5_train = np.zeros((len(sequences), len(TPC)))
percentage_TPC_5_train = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_5_train[i, j] = 0
        for k in amino_acid :
            for l in amino_acid :
                count_TPC_5_train[i, j] += sequence.count( k + l + amino_acid_combination )
                count_TPC_5_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1:] )
                count_TPC_5_train[i, j] += sequence.count( k + amino_acid_combination[:1] + l + amino_acid_combination[2] )
                count_TPC_5_train[i, j] += sequence.count( k + amino_acid_combination + l )
                count_TPC_5_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1:] )
                count_TPC_5_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + amino_acid_combination[2:] )
                count_TPC_5_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] + l )
                count_TPC_5_train[i, j] += sequence.count( amino_acid_combination[:1] + k + l + amino_acid_combination[2] )
                count_TPC_5_train[i, j] += sequence.count( amino_acid_combination[:1] + k + amino_acid_combination[2] + l )
                count_TPC_5_train[i, j] += sequence.count( amino_acid_combination + k + l )
    percentage_TPC_5_train.append(count_TPC_5_train[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_5_train = np.array(percentage_TPC_5_train)

# 顯示比例矩陣
df_percentage_TPC_5_train = pd.DataFrame(percentage_TPC_5_train, columns=TPC)
print('TPC_5:')
print(df_percentage_TPC_5_train)

# 合併特徵資料和目標資料
df_combined_TPC_5_train = pd.concat([df_percentage_train, df_percentage_DPC_train, df_percentage_TPC_5_train, bench_train['Header']], axis=1)
print('AAC + DPC + TPC_5:')
print(df_combined_TPC_5_train)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_5_train['Target'] = df_combined_TPC_5_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_5_train, y_TPC_5_train = df_combined_TPC_5_train.drop(['Header', 'Target'], axis=1), df_combined_TPC_5_train['Target']


# --------------------------------------------------------------
# 產生所有TPC穿插三個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_6_train = np.zeros((len(sequences), len(TPC)))
percentage_TPC_6_train = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_6_train[i, j] = 0
        for k in amino_acid :
            for l in amino_acid :
                for m in amino_acid :
                    count_TPC_6_train[i, j] += sequence.count( k + l + m + amino_acid_combination )
                    count_TPC_6_train[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + amino_acid_combination[1:] )
                    count_TPC_6_train[i, j] += sequence.count( k + l + amino_acid_combination[:1] + m + amino_acid_combination[2] )
                    count_TPC_6_train[i, j] += sequence.count( k + l + amino_acid_combination + m )
                    count_TPC_6_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + amino_acid_combination[1:] )
                    count_TPC_6_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1] + m + amino_acid_combination[2] )
                    count_TPC_6_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1:] + m )
                    count_TPC_6_train[i, j] += sequence.count( k + amino_acid_combination[0:1] + l + m + amino_acid_combination[2] )
                    count_TPC_6_train[i, j] += sequence.count( k + amino_acid_combination[0:1] + l + amino_acid_combination[2] + m )
                    count_TPC_6_train[i, j] += sequence.count( k + amino_acid_combination + l + m )

                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + amino_acid_combination[1:] )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1] + m + amino_acid_combination[2] )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1:] + m )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + m + amino_acid_combination[2] )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + amino_acid_combination[2] + m )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] + l + m )

                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[:1] + k + l + m + amino_acid_combination[2] )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[:1] + k + l + amino_acid_combination[2] + m )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[:1] + k + amino_acid_combination[2] + l + m )

                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination + k + l + m )
    percentage_TPC_6_train.append(count_TPC_6_train[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_6_train = np.array(percentage_TPC_6_train)

# 顯示比例矩陣hg
df_percentage_TPC_6_train = pd.DataFrame(percentage_TPC_6_train, columns=TPC)
print('TPC_6:')
print(df_percentage_TPC_6_train)

# 合併特徵資料和目標資料
df_combined_TPC_6_train = pd.concat([df_percentage_train, df_percentage_DPC_train, df_percentage_TPC_6_train, bench_train['Header']], axis=1)
print('AAC + DPC + TPC_6:')
print(df_combined_TPC_6_train)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_6_train['Target'] = df_combined_TPC_6_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_6_train, y_TPC_6_train = df_combined_TPC_6_train.drop(['Header', 'Target'], axis=1), df_combined_TPC_6_train['Target']


# --------------------------------------------------------------
# 產生所有TPC穿插三個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_7_train = np.zeros((len(sequences), len(TPC)))
percentage_TPC_7_train = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_7_train[i, j] = 0
        for k in amino_acid :
            for l in amino_acid :
                for m in amino_acid :
                    for n in amino_acid :
                        count_TPC_7_train[i, j] += sequence.count( k + l + m + n + amino_acid_combination )
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + n + amino_acid_combination[1:] )
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[:1] + k + l + m + n + amino_acid_combination[2] )
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination + k + l + m + n )
                        
                        count_TPC_7_train[i, j] += sequence.count( k + l + m + amino_acid_combination[0] + n + amino_acid_combination[1:])
                        count_TPC_7_train[i, j] += sequence.count( k + l + m + amino_acid_combination[:1] + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( k + l + m + amino_acid_combination + n )
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + n + amino_acid_combination[1:])
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[:1] + l + m + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination + l + m + n )
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + amino_acid_combination[1] + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + amino_acid_combination[1:] + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[:1] + k + l + m + amino_acid_combination[2] + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + m + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] + l + m + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[:1] + k + amino_acid_combination[2] + l + m + n)
                        
                        count_TPC_7_train[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + n + amino_acid_combination[1:])
                        count_TPC_7_train[i, j] += sequence.count( k + l + amino_acid_combination[:1] + m + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( k + l + amino_acid_combination + m + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1] + m + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1:] + m + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[:1] + k + l + amino_acid_combination[2] + m + n)
                        
                        count_TPC_7_train[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + amino_acid_combination[1] + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + amino_acid_combination[1:] + n)
                        count_TPC_7_train[i, j] += sequence.count( k + l + amino_acid_combination[:1] + m + amino_acid_combination[2] + n)
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1] + m + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1:] + m + n)
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[:1] + l + amino_acid_combination[2] + m + n)
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + amino_acid_combination[1] + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + amino_acid_combination[1:] + n)
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[:1] + l + m + amino_acid_combination[2] + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + amino_acid_combination[2] + m + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + m + amino_acid_combination[2] + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1] + m + amino_acid_combination[2] + n)
                        
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1] + m + amino_acid_combination[2] + n)
                        
    percentage_TPC_7_train.append(count_TPC_7_train[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_7_train = np.array(percentage_TPC_7_train)

# 顯示比例矩陣hg
df_percentage_TPC_7_train = pd.DataFrame(percentage_TPC_7_train, columns=TPC)
print('TPC_7:')
print(df_percentage_TPC_7_train)

# 合併特徵資料和目標資料
df_combined_TPC_7_train = pd.concat([df_percentage_train, df_percentage_DPC_train, df_percentage_TPC_7_train, bench_train['Header']], axis=1)
print('AAC + DPC + TPC_7:')
print(df_combined_TPC_7_train)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_7_train['Target'] = df_combined_TPC_7_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_7_train, y_TPC_7_train = df_combined_TPC_7_train.drop(['Header', 'Target'], axis=1), df_combined_TPC_7_train['Target']


      Header                Sequence
0      >AA29         FLKDHRISTFKNWPF
1      >AA30    FLSSRLQDLYSIVRRADRAA
2      >AA31            GDVIDTDRDIDR
3      >AA32          GFHDHGPCDPPSHK
4      >AA33       GHRATSDLASTGEESQD
..       ...                     ...
208  >neg131       VVRLAREPGKRESRYMH
209  >neg132       YEDLRDESLKGLVDIGF
210  >neg133  YFLIQSVSSTVMLLNGLYIFVN
211  >neg134         YGEPGMQLFVYGREE
212  >neg135    YNLSDTIKAFSILLLTDLCI

[213 rows x 2 columns]

percentage_train in 1-D:
[[0.         0.         0.06666667 ... 0.         0.06666667 0.        ]
 [0.15       0.         0.1        ... 0.05       0.         0.05      ]
 [0.         0.         0.41666667 ... 0.08333333 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.13636364 0.         0.09090909]
 [0.         0.         0.         ... 0.06666667 0.         0.13333333]
 [0.05       0.05       0.1        ... 0.         0.         0.05      ]]

            A         C         D         E         F        

TPC_4:
     AAA  AAC  AAD  AAE  AAF  AAG  AAH  AAI  AAK  AAL  ...  YYM  YYN  YYP  \
0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
208  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
209  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
210  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
211  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
212  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

     YYQ  YYR  YYS  YYT  YYV  YYW  YYY  
0    0.0  0.0  0.0  0.0  0.

TPC_7:
     AAA  AAC  AAD  AAE  AAF  AAG  AAH  AAI  AAK  AAL  ...  YYM  YYN  YYP  \
0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
208  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
209  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
210  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
211  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
212  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

     YYQ  YYR  YYS  YYT  YYV  YYW  YYY  
0    0.0  0.0  0.0  0.0  0.

In [66]:
with open('benchmarkdataset_test.fasta', 'r') as file:
    lines = file.readlines()

# 初始化兩個空的列表，分別用於存儲標題（Header）和序列（Sequence）
headers = []
sequences = []

# 遍歷 FASTA 文件的每一行
for line in lines:
    line = line.strip()  # 去掉行尾的空白字符

    # 如果行以 '>AA' 開頭，則視為標題
    if line.startswith('>AA') or line.startswith('>neg'):
        headers.append(line)
        sequences.append('')
    else:
        sequences[-1] += line

# 將標題和序列轉換為 DataFrame
data = {'Header': headers, 'Sequence': sequences}
bench_test = pd.DataFrame(data)

#print(bench_test)

print()
# 計算每種胺基酸的出現次數
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
count_matrix_test = np.zeros((len(sequences), len(amino_acids)))
percentage_matrix_test = []

for i, sequence in enumerate(sequences):
    for j, amino_acid in enumerate(amino_acids):
        count_matrix_test[i, j] = sequence.count(amino_acid)
    percentage_matrix_test.append(count_matrix_test[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_matrix_test = np.array(percentage_matrix_test)
print('percentage_test in 1-D:')
print(percentage_matrix_test)

print()
# 顯示比例矩陣
df_percentage_test = pd.DataFrame(percentage_matrix_test, columns=list(amino_acids))
#print(df_percentage_test)

# 合併特徵資料和目標資料
df_combined_test = pd.concat([df_percentage_test, bench_test['Header']], axis=1)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_test['Target'] = df_combined_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_test, y_test = df_combined_test.drop(['Header', 'Target'], axis=1), df_combined_test['Target']

#print(X_test)
#print(y_test)
#print(y_test.value_counts())

#--------------------------------------------------------------
# 計算每對胺基酸的出現次數
count_DPC_test = np.zeros((len(sequences), len(DPC)))
percentage_DPC_test = []

for i, sequence in enumerate(sequences):
    for j, aa_pair in enumerate(DPC):
        count_DPC_test[i, j] = sequence.count(aa_pair)
        
    percentage_DPC_test.append(count_DPC_test[i]/len(sequence))

# 計算每對胺基酸在序列中的比例
percentage_DPC_test = np.array(percentage_DPC_test)
print('percentage_DPC_train in 1-D:')
print(percentage_DPC_test)

# 顯示比例矩陣
df_percentage_DPC_test = pd.DataFrame(percentage_DPC_test, columns=DPC)
print(df_percentage_DPC_test)

# 合併特徵資料和目標資料
df_combined_DPC_test = pd.concat([df_percentage_test, df_percentage_DPC_test, bench_test['Header']], axis=1)
#df_combined_DPC_test = pd.concat([df_percentage_DPC_test, bench_test['Header']], axis=1)
print(df_combined_DPC_test)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_DPC_test['Target'] = df_combined_DPC_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_DPC_test, y_DPC_test = df_combined_DPC_test.drop(['Header', 'Target'], axis=1), df_combined_DPC_test['Target']

#--------------------------------------------------------------
# 產生所有三種胺基酸的排列組合
TPC = [''.join(comb) for comb in product(amino_acids, repeat=3)]

# 計算每種胺基酸的出現次數
count_TPC_test = np.zeros((len(sequences), len(TPC)))
percentage_TPC_test = []

for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_test[i, j] = sequence.count(amino_acid_combination)
        
    percentage_TPC_test.append(count_TPC_test[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_test = np.array(percentage_TPC_test)
print('percentage_TPC_test in 1-D:')
print(percentage_TPC_test)

# 顯示比例矩陣
df_percentage_TPC_test = pd.DataFrame(percentage_TPC_test, columns=TPC)
print(df_percentage_TPC_test)

# 合併特徵資料和目標資料
df_combined_TPC_test = pd.concat([df_percentage_test, df_percentage_DPC_test, df_percentage_TPC_test, bench_test['Header']], axis=1)
print(df_combined_TPC_test)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_test['Target'] = df_combined_TPC_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_test, y_TPC_test = df_combined_TPC_test.drop(['Header', 'Target'], axis=1), df_combined_TPC_test['Target']


#--------------------------------------------------------------
# 將DPC抓到比較好的2個胺基酸加進DPC的feature

# 初始化 CN 和 CS 的特徵矩陣
combinations_CN = ['CN' + ''.join(comb) for comb in product(amino_acids, repeat=1)] + [''.join(comb) + 'CN' for comb in product(amino_acids, repeat=1)]
combinations_CS = ['CS' + ''.join(comb) for comb in product(amino_acids, repeat=1)] + [''.join(comb) + 'CS' for comb in product(amino_acids, repeat=1)]

all_combinations = combinations_CN + combinations_CS

count_matrix_total = np.zeros((len(sequences), len(all_combinations)))
percentage_matrix_total = []

for i, sequence in enumerate(sequences):
    for j, combo in enumerate(all_combinations):
        count_matrix_total[i, j] = sequence.count(combo)
    
    percentage_matrix_total.append(count_matrix_total[i] / len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_matrix_total = np.array(percentage_matrix_total)
print('percentage_combination_test in 1-D:')
print(percentage_matrix_total)

df_percentage_total = pd.DataFrame(percentage_matrix_total, columns=all_combinations)
print(df_percentage_total)

# 將 CN、CS 開頭的特徵和 DPC 特徵合併
df_combined_total = pd.concat([df_percentage_total, df_percentage_DPC_test, bench_test['Header']], axis=1)
print(df_combined_total)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_total['Target'] = df_combined_total['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_combination_test, y_combination_test = df_combined_total.drop(['Header', 'Target'], axis=1), df_combined_total['Target']

# --------------------------------------------------------------
# 產生所有TPC穿插一個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_4_test = np.zeros((len(sequences), len(TPC)))
percentage_TPC_4_test = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_4_test[i, j] = 0
        for k in amino_acid :
            # 看起來很愚蠢但這是最不吃效能的方法
            count_TPC_4_test[i, j] += sequence.count( k + amino_acid_combination )
            count_TPC_4_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] )
            count_TPC_4_test[i, j] += sequence.count( amino_acid_combination[:1] + k + amino_acid_combination[2] )
            count_TPC_4_test[i, j] += sequence.count( amino_acid_combination + k )
    percentage_TPC_4_test.append(count_TPC_4_test[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_4_test = np.array(percentage_TPC_4_test)

# 顯示比例矩陣
df_percentage_TPC_4_test = pd.DataFrame(percentage_TPC_4_test, columns=TPC)
print('TPC_4:')
print(df_percentage_TPC_4_test)

# 合併特徵資料和目標資料
df_combined_TPC_4_test = pd.concat([df_percentage_test, df_percentage_DPC_test, df_percentage_TPC_4_test, bench_test['Header']], axis=1)
print('AAC + DPC + TPC_4:')
print(df_combined_TPC_4_test)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_4_test['Target'] = df_combined_TPC_4_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_4_test, y_TPC_4_test = df_combined_TPC_4_test.drop(['Header', 'Target'], axis=1), df_combined_TPC_4_test['Target']



# --------------------------------------------------------------
# 產生所有TPC穿插兩個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_5_test = np.zeros((len(sequences), len(TPC)))
percentage_TPC_5_test = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_5_test[i, j] = 0
        for k in amino_acid :
            for l in amino_acid :
                count_TPC_5_test[i, j] += sequence.count( k + l + amino_acid_combination )
                count_TPC_5_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1:] )
                count_TPC_5_test[i, j] += sequence.count( k + amino_acid_combination[:1] + l + amino_acid_combination[2] )
                count_TPC_5_test[i, j] += sequence.count( k + amino_acid_combination + l )
                count_TPC_5_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1:] )
                count_TPC_5_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + amino_acid_combination[2:] )
                count_TPC_5_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] + l )
                count_TPC_5_test[i, j] += sequence.count( amino_acid_combination[0:1] + k + l + amino_acid_combination[2] )
                count_TPC_5_test[i, j] += sequence.count( amino_acid_combination[0:1] + k + amino_acid_combination[2] + l )
                count_TPC_5_test[i, j] += sequence.count( amino_acid_combination + k + l )
    percentage_TPC_5_test.append(count_TPC_5_test[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_5_test = np.array(percentage_TPC_5_test)

# 顯示比例矩陣
df_percentage_TPC_5_test = pd.DataFrame(percentage_TPC_5_test, columns=TPC)
print('TPC_5:')
print(df_percentage_TPC_5_test)

# 合併特徵資料和目標資料
df_combined_TPC_5_test = pd.concat([df_percentage_test, df_percentage_DPC_test, df_percentage_TPC_5_test, bench_test['Header']], axis=1)
print('AAC + DPC + TPC_5:')
print(df_combined_TPC_5_test)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_5_test['Target'] = df_combined_TPC_5_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_5_test, y_TPC_5_test = df_combined_TPC_5_test.drop(['Header', 'Target'], axis=1), df_combined_TPC_5_test['Target']



# --------------------------------------------------------------
# 產生所有TPC穿插三個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_6_test = np.zeros((len(sequences), len(TPC)))
percentage_TPC_6_test = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_6_test[i, j] = 0
        for k in amino_acid :
            for l in amino_acid :
                for m in amino_acid :
                    count_TPC_6_test[i, j] += sequence.count( k + l + m + amino_acid_combination )
                    count_TPC_6_test[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + amino_acid_combination[1:] )
                    count_TPC_6_test[i, j] += sequence.count( k + l + amino_acid_combination[:1] + m + amino_acid_combination[2] )
                    count_TPC_6_test[i, j] += sequence.count( k + l + amino_acid_combination + m )
                    count_TPC_6_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + amino_acid_combination[1:] )
                    count_TPC_6_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1] + m + amino_acid_combination[2] )
                    count_TPC_6_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1:] + m )
                    count_TPC_6_test[i, j] += sequence.count( k + amino_acid_combination[0:1] + l + m + amino_acid_combination[2] )
                    count_TPC_6_test[i, j] += sequence.count( k + amino_acid_combination[0:1] + l + amino_acid_combination[2] + m )
                    count_TPC_6_test[i, j] += sequence.count( k + amino_acid_combination + l + m )

                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + amino_acid_combination[1:] )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1] + m + amino_acid_combination[2] )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1:] + m )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + m + amino_acid_combination[2] )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + amino_acid_combination[2] + m )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] + l + m )

                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[:1] + k + l + m + amino_acid_combination[2] )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[:1] + k + l + amino_acid_combination[2] + m )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[:1] + k + amino_acid_combination[2] + l + m )

                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination + k + l + m )
    percentage_TPC_6_test.append(count_TPC_6_test[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_6_test = np.array(percentage_TPC_6_test)

# 顯示比例矩陣hg
df_percentage_TPC_6_test = pd.DataFrame(percentage_TPC_6_test, columns=TPC)
print('TPC_6:')
print(df_percentage_TPC_6_test)

# 合併特徵資料和目標資料
df_combined_TPC_6_test = pd.concat([df_percentage_test, df_percentage_DPC_test, df_percentage_TPC_6_test, bench_test['Header']], axis=1)
print('AAC + DPC + TPC_6:')
print(df_combined_TPC_6_test)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_6_test['Target'] = df_combined_TPC_6_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_6_test, y_TPC_6_test = df_combined_TPC_6_test.drop(['Header', 'Target'], axis=1), df_combined_TPC_6_test['Target']


# --------------------------------------------------------------
# 產生所有TPC穿插三個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_7_test = np.zeros((len(sequences), len(TPC)))
percentage_TPC_7_test = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_7_test[i, j] = 0
        for k in amino_acid :
            for l in amino_acid :
                for m in amino_acid :
                    for n in amino_acid :
                        count_TPC_7_test[i, j] += sequence.count( k + l + m + n + amino_acid_combination )
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + n + amino_acid_combination[1:] )
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[:1] + k + l + m + n + amino_acid_combination[2] )
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination + k + l + m + n )
                        
                        count_TPC_7_test[i, j] += sequence.count( k + l + m + amino_acid_combination[0] + n + amino_acid_combination[1:])
                        count_TPC_7_test[i, j] += sequence.count( k + l + m + amino_acid_combination[:1] + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( k + l + m + amino_acid_combination + n )
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + n + amino_acid_combination[1:])
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[:1] + l + m + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination + l + m + n )
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + amino_acid_combination[1] + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + amino_acid_combination[1:] + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[:1] + k + l + m + amino_acid_combination[2] + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + m + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] + l + m + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[:1] + k + amino_acid_combination[2] + l + m + n)
                        
                        count_TPC_7_test[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + n + amino_acid_combination[1:])
                        count_TPC_7_test[i, j] += sequence.count( k + l + amino_acid_combination[:1] + m + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( k + l + amino_acid_combination + m + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1] + m + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1:] + m + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[:1] + k + l + amino_acid_combination[2] + m + n)
                        
                        count_TPC_7_test[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + amino_acid_combination[1] + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + amino_acid_combination[1:] + n)
                        count_TPC_7_test[i, j] += sequence.count( k + l + amino_acid_combination[:1] + m + amino_acid_combination[2] + n)
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1] + m + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1:] + m + n)
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[:1] + l + amino_acid_combination[2] + m + n)
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + amino_acid_combination[1] + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + amino_acid_combination[1:] + n)
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[:1] + l + m + amino_acid_combination[2] + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + amino_acid_combination[2] + m + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + m + amino_acid_combination[2] + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1] + m + amino_acid_combination[2] + n)
                        
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1] + m + amino_acid_combination[2] + n)
                        
    percentage_TPC_7_test.append(count_TPC_7_test[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_7_test = np.array(percentage_TPC_7_test)

# 顯示比例矩陣hg
df_percentage_TPC_7_test = pd.DataFrame(percentage_TPC_7_test, columns=TPC)
print('TPC_7:')
print(df_percentage_TPC_7_test)

# 合併特徵資料和目標資料
df_combined_TPC_7_test = pd.concat([df_percentage_test, df_percentage_DPC_test, df_percentage_TPC_7_test, bench_test['Header']], axis=1)
print('AAC + DPC + TPC_7:')
print(df_combined_TPC_7_test)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_7_test['Target'] = df_combined_TPC_7_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_7_test, y_TPC_7_test = df_combined_TPC_7_test.drop(['Header', 'Target'], axis=1), df_combined_TPC_7_test['Target']


percentage_test in 1-D:
[[0.15789474 0.10526316 0.         ... 0.         0.         0.        ]
 [0.05882353 0.         0.05882353 ... 0.05882353 0.02941176 0.        ]
 [0.35       0.         0.05       ... 0.05       0.         0.        ]
 ...
 [0.23076923 0.         0.07692308 ... 0.         0.         0.        ]
 [0.16666667 0.         0.04166667 ... 0.08333333 0.         0.04166667]
 [0.12       0.02       0.06       ... 0.06       0.         0.02      ]]

percentage_DPC_train in 1-D:
[[0.05263158 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.05       0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.07692308 ... 0.         0.         0.        ]
 [0.04166667 0.         0.04166667 ... 0.         0.         0.        ]
 [0.02       0.         0.         ... 0.         0.         0.        ]]
          AA        AC        AD        AE        AF     

         AAA  AAC       AAD       AAE  AAF  AAG  AAH  AAI       AAK       AAL  \
0   0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.000000  0.000000   
1   0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.000000  0.000000   
2   0.050000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.000000  0.000000   
3   0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.000000  0.000000   
4   0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.000000  0.000000   
5   0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.000000  0.000000   
6   0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.000000  0.000000   
7   0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.000000  0.000000   
8   0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.000000  0.000000   
9   0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.000000  0.000000   
10  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.000000  0.000000   
11  0.000000  0.0  0.000000 

TPC_4:
    AAA  AAC  AAD       AAE  AAF  AAG  AAH  AAI  AAK   AAL  ...  YYM  YYN  \
0   0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.00  ...  0.0  0.0   
1   0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.00  ...  0.0  0.0   
2   0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.00  ...  0.0  0.0   
3   0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.00  ...  0.0  0.0   
4   0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.00  ...  0.0  0.0   
5   0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.00  ...  0.0  0.0   
6   0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.00  ...  0.0  0.0   
7   0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.00  ...  0.0  0.0   
8   0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.00  ...  0.0  0.0   
9   0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.00  ...  0.0  0.0   
10  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.00  ...  0.0  0.0   
11  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.00  ...  0.0 

TPC_5:
    AAA  AAC  AAD  AAE  AAF  AAG  AAH  AAI  AAK  AAL  ...  YYM  YYN  YYP  YYQ  \
0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
5   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
6   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
7   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
8   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
9   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
11  0.0  0.0  0.0  0.

TPC_6:
    AAA  AAC  AAD  AAE  AAF  AAG  AAH  AAI  AAK  AAL  ...  YYM  YYN  YYP  YYQ  \
0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
5   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
6   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
7   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
8   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
9   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
11  0.0  0.0  0.0  0.

TPC_7:
    AAA  AAC  AAD  AAE  AAF  AAG  AAH  AAI  AAK  AAL  ...  YYM  YYN  YYP  YYQ  \
0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
5   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
6   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
7   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
8   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
9   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
11  0.0  0.0  0.0  0.

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
import time

# 定義參數範圍
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

rf_classifier = RandomForestClassifier()
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')

start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()
execution_time = end_time - start_time
print(f"Grid Search Execution Time: {execution_time:.2f} seconds")

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_model = grid_search.best_estimator_

#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_train, y_train)
selected_features = X_train.columns[sfm.get_support()]

X_train_selected = sfm.transform(X_train)
X_test_selected = sfm.transform(X_test)

# 印出篩選後的特徵
print("Selected Features:", selected_features)

# 將模型擬合到篩選後的訓練數據上
best_model.fit(X_train_selected, y_train)

y_pred = best_model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy(AAC): {accuracy * 100:.2f}%')

print('-------------------------------------------------')
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')

start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()
execution_time = end_time - start_time
print(f"Grid Search Execution Time: {execution_time:.2f} seconds")

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_model = grid_search.best_estimator_

#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_DPC_train, y_DPC_train)
selected_features = X_DPC_train.columns[sfm.get_support()]

X_DPC_train_selected = sfm.transform(X_DPC_train)
X_DPC_test_selected = sfm.transform(X_DPC_test)

# 印出篩選後的特徵
print("DPC Selected Features:", selected_features)

best_model.fit(X_DPC_train_selected, y_DPC_train)

y_DPC_pred = best_model.predict(X_DPC_test_selected)
accuracy_DPC = accuracy_score(y_DPC_test, y_DPC_pred)
print(f'Model Accuracy(DPC): {accuracy_DPC * 100:.2f}%')

print('-------------------------------------------------')
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')

start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()
execution_time = end_time - start_time
print(f"Grid Search Execution Time: {execution_time:.2f} seconds")

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_model = grid_search.best_estimator_

#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_TPC_train, y_TPC_train)
selected_features = X_TPC_train.columns[sfm.get_support()]

X_TPC_train_selected = sfm.transform(X_TPC_train)
X_TPC_test_selected = sfm.transform(X_TPC_test)

# 印出篩選後的特徵
print("TPC Selected Features:", selected_features)

best_model.fit(X_TPC_train_selected, y_TPC_train)

y_TPC_pred = best_model.predict(X_TPC_test_selected)
accuracy_TPC = accuracy_score(y_TPC_test, y_TPC_pred)
print(f'Model Accuracy(TPC): {accuracy_TPC * 100:.2f}%')

Grid Search Execution Time: 134.12 seconds
Best Parameters: {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Selected Features: Index(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
       'R', 'S', 'T', 'V', 'W', 'Y'],
      dtype='object')
Model Accuracy(AAC): 69.09%
-------------------------------------------------
Grid Search Execution Time: 134.46 seconds
Best Parameters: {'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
DPC Selected Features: Index(['A', 'C', 'D', 'E', 'H', 'I', 'L', 'P', 'S', 'T', 'V', 'CG', 'GR', 'IV',
       'RT', 'TP', 'VS', 'WT'],
      dtype='object')
Model Accuracy(DPC): 69.09%
-------------------------------------------------
Grid Search Execution Time: 131.83 seconds
Best Parameters: {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
TPC Selected Feature

In [273]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier()

# Single
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy(AAC): {accuracy * 100:.2f}%')


# DPC
rf_classifier.fit(X_DPC_train, y_DPC_train)
y_DPC_pred = rf_classifier.predict(X_DPC_test)

accuracy_DPC = accuracy_score(y_DPC_test, y_DPC_pred)
print(f'Model Accuracy(DPC): {accuracy_DPC * 100:.2f}%')


# TPC
rf_classifier.fit(X_TPC_train, y_TPC_train)
y_TPC_pred = rf_classifier.predict(X_TPC_test)

accuracy_TPC = accuracy_score(y_TPC_test, y_TPC_pred)
print(f'Model Accuracy(TPC): {accuracy_TPC * 100:.2f}%')


# TPC
rf_classifier.fit(X_combination_train, y_combination_train)
y_combination_pred = rf_classifier.predict(X_combination_test)

accuracy_combination = accuracy_score(y_combination_test, y_combination_pred)
print(f'Model Accuracy(combination): {accuracy_combination * 100:.2f}%')


# TPC_4
rf_classifier.fit(X_TPC_4_train, y_TPC_4_train)
y_TPC_4_pred = rf_classifier.predict(X_TPC_4_test)

accuracy_TPC_4 = accuracy_score(y_TPC_4_test, y_TPC_4_pred)
print(f'Model Accuracy(TPC_4): {accuracy_TPC_4 * 100:.2f}%')

# TPC_5
rf_classifier.fit(X_TPC_5_train, y_TPC_5_train)
y_TPC_5_pred = rf_classifier.predict(X_TPC_5_test)

accuracy_TPC_5 = accuracy_score(y_TPC_5_test, y_TPC_5_pred)
print(f'Model Accuracy(TPC_5): {accuracy_TPC_5 * 100:.2f}%')

# TPC_6
rf_classifier.fit(X_TPC_6_train, y_TPC_6_train)
y_TPC_6_pred = rf_classifier.predict(X_TPC_6_test)

accuracy_TPC_6 = accuracy_score(y_TPC_6_test, y_TPC_6_pred)
print(f'Model Accuracy(TPC_6): {accuracy_TPC_6 * 100:.2f}%')

# TPC_7
rf_classifier.fit(X_TPC_7_train, y_TPC_7_train)
y_TPC_7_pred = rf_classifier.predict(X_TPC_7_test)

accuracy_TPC_7 = accuracy_score(y_TPC_7_test, y_TPC_7_pred)
print(f'Model Accuracy(TPC_7): {accuracy_TPC_7 * 100:.2f}%')

Model Accuracy(AAC): 60.00%
Model Accuracy(DPC): 60.00%
Model Accuracy(TPC): 76.36%
Model Accuracy(combination): 61.82%
Model Accuracy(TPC_4): 58.18%
Model Accuracy(TPC_5): 60.00%
Model Accuracy(TPC_6): 67.27%
Model Accuracy(TPC_7): 49.09%


In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

rf_classifier = RandomForestClassifier()


# --------------------------------------------------------------
rf_classifier.fit(X_train, y_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_train, y_train)

X_train_selected = sfm.transform(X_train)
X_test_selected = sfm.transform(X_test)

selected_features = X_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("AAC Selected Features:")
# print(selected_features)

rf_classifier.fit(X_train_selected, y_train)

y_pred = rf_classifier.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy(AAC): {accuracy * 100:.2f}%')



# --------------------------------------------------------------
rf_classifier.fit(X_DPC_train, y_DPC_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_DPC_train, y_DPC_train)

X_DPC_train_selected = sfm.transform(X_DPC_train)
X_DPC_test_selected = sfm.transform(X_DPC_test)

selected_features = X_DPC_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("DPC Selected Features:")
# print(selected_features)

rf_classifier.fit(X_DPC_train_selected, y_DPC_train)

y_DPC_pred = rf_classifier.predict(X_DPC_test_selected)
accuracy_DPC = accuracy_score(y_DPC_test, y_DPC_pred)
print(f'Model Accuracy(DPC): {accuracy_DPC * 100:.2f}%')


# --------------------------------------------------------------
rf_classifier.fit(X_TPC_train, y_TPC_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_TPC_train, y_TPC_train)

X_TPC_train_selected = sfm.transform(X_TPC_train)
X_TPC_test_selected = sfm.transform(X_TPC_test)

selected_features = X_TPC_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("TPC Selected Features:")
# print(selected_features)

rf_classifier.fit(X_TPC_train_selected, y_TPC_train)

y_TPC_pred = rf_classifier.predict(X_TPC_test_selected)
accuracy_TPC = accuracy_score(y_TPC_test, y_TPC_pred)
print(f'Model Accuracy(TPC): {accuracy_TPC * 100:.2f}%')


# --------------------------------------------------------------
rf_classifier.fit(X_combination_train, y_combination_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_combination_train, y_combination_train)

X_combination_train_selected = sfm.transform(X_combination_train)
X_combination_test_selected = sfm.transform(X_combination_test)

selected_features = X_combination_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("TPC Selected Features:")
# print(selected_features)

rf_classifier.fit(X_combination_train_selected, y_combination_train)

y_combination_pred = rf_classifier.predict(X_combination_test_selected)
accuracy_combination = accuracy_score(y_combination_test, y_combination_pred)
print(f'Model Accuracy(combination): {accuracy_combination * 100:.2f}%')

# --------------------------------------------------------------
# TPC_4
rf_classifier.fit(X_TPC_4_train, y_TPC_4_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_TPC_4_train, y_TPC_4_train)

X_TPC_4_train_selected = sfm.transform(X_TPC_4_train)
X_TPC_4_test_selected = sfm.transform(X_TPC_4_test)

selected_features = X_TPC_4_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("TPC_4 Selected Features:")
# print(selected_features)

rf_classifier.fit(X_TPC_4_train_selected, y_TPC_4_train)

y_TPC_4_pred = rf_classifier.predict(X_TPC_4_test_selected)
accuracy_TPC_4 = accuracy_score(y_TPC_4_test, y_TPC_4_pred)
print(f'Model Accuracy(TPC_4): {accuracy_TPC_4 * 100:.2f}%')


# --------------------------------------------------------------
# TPC_5
rf_classifier.fit(X_TPC_5_train, y_TPC_5_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_TPC_5_train, y_TPC_5_train)

X_TPC_5_train_selected = sfm.transform(X_TPC_5_train)
X_TPC_5_test_selected = sfm.transform(X_TPC_5_test)

selected_features = X_TPC_5_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("TPC_5 Selected Features:")
# print(selected_features)

rf_classifier.fit(X_TPC_5_train_selected, y_TPC_5_train)

y_TPC_5_pred = rf_classifier.predict(X_TPC_5_test_selected)
accuracy_TPC_5 = accuracy_score(y_TPC_5_test, y_TPC_5_pred)
print(f'Model Accuracy(TPC_5): {accuracy_TPC_5 * 100:.2f}%')


# --------------------------------------------------------------
# TPC_6
rf_classifier.fit(X_TPC_6_train, y_TPC_6_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_TPC_6_train, y_TPC_6_train)

X_TPC_6_train_selected = sfm.transform(X_TPC_6_train)
X_TPC_6_test_selected = sfm.transform(X_TPC_6_test)

selected_features = X_TPC_6_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("TPC_6 Selected Features:")
# print(selected_features)

rf_classifier.fit(X_TPC_6_train_selected, y_TPC_6_train)

y_TPC_6_pred = rf_classifier.predict(X_TPC_6_test_selected)
accuracy_TPC_6 = accuracy_score(y_TPC_6_test, y_TPC_6_pred)
print(f'Model Accuracy(TPC_6): {accuracy_TPC_6 * 100:.2f}%')


# --------------------------------------------------------------
# TPC_7
rf_classifier.fit(X_TPC_7_train, y_TPC_7_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_TPC_7_train, y_TPC_7_train)

X_TPC_7_train_selected = sfm.transform(X_TPC_7_train)
X_TPC_7_test_selected = sfm.transform(X_TPC_7_test)

selected_features = X_TPC_7_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("TPC_7 Selected Features:")
# print(selected_features)

rf_classifier.fit(X_TPC_7_train_selected, y_TPC_7_train)

y_TPC_7_pred = rf_classifier.predict(X_TPC_7_test_selected)
accuracy_TPC_7 = accuracy_score(y_TPC_7_test, y_TPC_7_pred)
print(f'Model Accuracy(TPC_7): {accuracy_TPC_7 * 100:.2f}%')

Model Accuracy(AAC): 70.00%
Model Accuracy(DPC): 62.50%
Model Accuracy(TPC): 70.00%
Model Accuracy(combination): 70.00%
Model Accuracy(TPC_4): 67.50%
Model Accuracy(TPC_5): 62.50%
Model Accuracy(TPC_6): 70.00%
Model Accuracy(TPC_7): 57.50%


In [306]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# -------------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

pca = PCA(n_components=0.6)

X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_pca, y_train)

y_pred = rf_classifier.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy(AAC): {accuracy * 100:.2f}%')


# -------------------------------------------------
scaler = StandardScaler()
X_DPC_train_scaled = scaler.fit_transform(X_DPC_train)
X_DPC_test_scaled = scaler.transform(X_DPC_test)

pca = PCA(n_components=0.6)

X_DPC_train_pca = pca.fit_transform(X_DPC_train_scaled)
X_DPC_test_pca = pca.transform(X_DPC_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_DPC_train_pca, y_DPC_train)

y_DPC_pred = rf_classifier.predict(X_DPC_test_pca)
accuracy = accuracy_score(y_DPC_test, y_DPC_pred)
print(f'Model Accuracy(DPC): {accuracy * 100:.2f}%')


# -------------------------------------------------
scaler = StandardScaler()
X_TPC_train_scaled = scaler.fit_transform(X_TPC_train)
X_TPC_test_scaled = scaler.transform(X_TPC_test)

pca = PCA(n_components=0.6)

X_TPC_train_pca = pca.fit_transform(X_TPC_train_scaled)
X_TPC_test_pca = pca.transform(X_TPC_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_TPC_train_pca, y_TPC_train)

y_TPC_pred = rf_classifier.predict(X_TPC_test_pca)
accuracy = accuracy_score(y_TPC_test, y_TPC_pred)
print(f'Model Accuracy(TPC): {accuracy * 100:.2f}%')


# -------------------------------------------------
scaler = StandardScaler()
X_combination_train_scaled = scaler.fit_transform(X_combination_train)
X_combination_test_scaled = scaler.transform(X_combination_test)

pca = PCA(n_components=0.6)

X_combination_train_pca = pca.fit_transform(X_combination_train_scaled)
X_combination_test_pca = pca.transform(X_combination_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_combination_train_pca, y_combination_train)

y_combination_pred = rf_classifier.predict(X_combination_test_pca)
accuracy = accuracy_score(y_combination_test, y_combination_pred)
print(f'Model Accuracy(combination): {accuracy * 100:.2f}%')

# -------------------------------------------------
# TPC_4
scaler = StandardScaler()
X_TPC_4_train_scaled = scaler.fit_transform(X_TPC_4_train)
X_TPC_4_test_scaled = scaler.transform(X_TPC_4_test)

pca = PCA(n_components=0.6)

X_TPC_4_train_pca = pca.fit_transform(X_TPC_4_train_scaled)
X_TPC_4_test_pca = pca.transform(X_TPC_4_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_TPC_4_train_pca, y_TPC_4_train)

y_TPC_4_pred = rf_classifier.predict(X_TPC_4_test_pca)
accuracy = accuracy_score(y_TPC_4_test, y_TPC_4_pred)
print(f'Model Accuracy(TPC_4): {accuracy * 100:.2f}%')

# -------------------------------------------------
# TPC_5
scaler = StandardScaler()
X_TPC_5_train_scaled = scaler.fit_transform(X_TPC_5_train)
X_TPC_5_test_scaled = scaler.transform(X_TPC_5_test)

pca = PCA(n_components=0.6)

X_TPC_5_train_pca = pca.fit_transform(X_TPC_5_train_scaled)
X_TPC_5_test_pca = pca.transform(X_TPC_5_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_TPC_5_train_pca, y_TPC_5_train)

y_TPC_5_pred = rf_classifier.predict(X_TPC_5_test_pca)
accuracy = accuracy_score(y_TPC_5_test, y_TPC_5_pred)
print(f'Model Accuracy(TPC_5): {accuracy * 100:.2f}%')


# -------------------------------------------------
# TPC_6
scaler = StandardScaler()
X_TPC_6_train_scaled = scaler.fit_transform(X_TPC_6_train)
X_TPC_6_test_scaled = scaler.transform(X_TPC_6_test)

pca = PCA(n_components=0.6)

X_TPC_6_train_pca = pca.fit_transform(X_TPC_6_train_scaled)
X_TPC_6_test_pca = pca.transform(X_TPC_6_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_TPC_6_train_pca, y_TPC_6_train)

y_TPC_6_pred = rf_classifier.predict(X_TPC_6_test_pca)
accuracy = accuracy_score(y_TPC_6_test, y_TPC_6_pred)
print(f'Model Accuracy(TPC_6): {accuracy * 100:.2f}%')


# -------------------------------------------------
# TPC_7
scaler = StandardScaler()
X_TPC_7_train_scaled = scaler.fit_transform(X_TPC_7_train)
X_TPC_7_test_scaled = scaler.transform(X_TPC_7_test)

pca = PCA(n_components=0.6)

X_TPC_7_train_pca = pca.fit_transform(X_TPC_7_train_scaled)
X_TPC_7_test_pca = pca.transform(X_TPC_7_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_TPC_7_train_pca, y_TPC_7_train)

y_TPC_7_pred = rf_classifier.predict(X_TPC_7_test_pca)
accuracy = accuracy_score(y_TPC_7_test, y_TPC_7_pred)
print(f'Model Accuracy(TPC_7): {accuracy * 100:.2f}%')

Model Accuracy(AAC): 65.45%
Model Accuracy(DPC): 63.64%
Model Accuracy(TPC): 61.82%
Model Accuracy(combination): 52.73%
Model Accuracy(TPC_4): 50.91%
Model Accuracy(TPC_5): 61.82%
Model Accuracy(TPC_6): 56.36%
Model Accuracy(TPC_7): 56.36%


In [229]:
from sklearn.feature_selection import RFE

scaler = StandardScaler()

rf_classifier = RandomForestClassifier()


# -------------------------------------------------
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_train_rfe = rfe.fit_transform(X_train_scaled, y_train)
X_test_rfe = rfe.transform(X_test_scaled)

rf_classifier.fit(X_train_rfe, y_train)
y_pred = rf_classifier.predict(X_test_rfe)

accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy(AAC): {accuracy * 100:.2f}%')


# -------------------------------------------------
X_DPC_train_scaled = scaler.fit_transform(X_DPC_train)
X_DPC_test_scaled = scaler.transform(X_DPC_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_DPC_train_rfe = rfe.fit_transform(X_DPC_train_scaled, y_DPC_train)
X_DPC_test_rfe = rfe.transform(X_DPC_test_scaled)

rf_classifier.fit(X_DPC_train_rfe, y_DPC_train)
y_DPC_pred = rf_classifier.predict(X_DPC_test_rfe)

accuracy = accuracy_score(y_DPC_test, y_DPC_pred)
print(f'Model Accuracy(DPC): {accuracy * 100:.2f}%')


# -------------------------------------------------
X_TPC_train_scaled = scaler.fit_transform(X_TPC_train)
X_TPC_test_scaled = scaler.transform(X_TPC_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_TPC_train_rfe = rfe.fit_transform(X_TPC_train_scaled, y_TPC_train)
X_TPC_test_rfe = rfe.transform(X_TPC_test_scaled)

rf_classifier.fit(X_TPC_train_rfe, y_TPC_train)
y_TPC_pred = rf_classifier.predict(X_TPC_test_rfe)

accuracy = accuracy_score(y_TPC_test, y_TPC_pred)
print(f'Model Accuracy(TPC): {accuracy * 100:.2f}%')


# -------------------------------------------------
X_combination_train_scaled = scaler.fit_transform(X_combination_train)
X_combination_test_scaled = scaler.transform(X_combination_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_combination_train_rfe = rfe.fit_transform(X_combination_train_scaled, y_combination_train)
X_combination_test_rfe = rfe.transform(X_combination_test_scaled)

rf_classifier.fit(X_combination_train_rfe, y_combination_train)
y_combination_pred = rf_classifier.predict(X_combination_test_rfe)

accuracy = accuracy_score(y_combination_test, y_combination_pred)
print(f'Model Accuracy(combination): {accuracy * 100:.2f}%')


# -------------------------------------------------
# TPC_4
X_TPC_4_train_scaled = scaler.fit_transform(X_TPC_4_train)
X_TPC_4_test_scaled = scaler.transform(X_TPC_4_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_TPC_4_train_rfe = rfe.fit_transform(X_TPC_4_train_scaled, y_TPC_4_train)
X_TPC_4_test_rfe = rfe.transform(X_TPC_4_test_scaled)

rf_classifier.fit(X_TPC_4_train_rfe, y_TPC_4_train)
y_TPC_4_pred = rf_classifier.predict(X_TPC_4_test_rfe)

accuracy = accuracy_score(y_TPC_4_test, y_TPC_4_pred)
print(f'Model Accuracy(TPC_4): {accuracy * 100:.2f}%')

# -------------------------------------------------
# TPC_5
X_TPC_5_train_scaled = scaler.fit_transform(X_TPC_5_train)
X_TPC_5_test_scaled = scaler.transform(X_TPC_5_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_TPC_5_train_rfe = rfe.fit_transform(X_TPC_5_train_scaled, y_TPC_5_train)
X_TPC_5_test_rfe = rfe.transform(X_TPC_5_test_scaled)

rf_classifier.fit(X_TPC_5_train_rfe, y_TPC_5_train)
y_TPC_5_pred = rf_classifier.predict(X_TPC_5_test_rfe)

accuracy = accuracy_score(y_TPC_5_test, y_TPC_5_pred)
print(f'Model Accuracy(TPC_5): {accuracy * 100:.2f}%')

# -------------------------------------------------
# TPC_6
X_TPC_6_train_scaled = scaler.fit_transform(X_TPC_6_train)
X_TPC_6_test_scaled = scaler.transform(X_TPC_6_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_TPC_6_train_rfe = rfe.fit_transform(X_TPC_6_train_scaled, y_TPC_6_train)
X_TPC_6_test_rfe = rfe.transform(X_TPC_6_test_scaled)

rf_classifier.fit(X_TPC_6_train_rfe, y_TPC_6_train)
y_TPC_6_pred = rf_classifier.predict(X_TPC_6_test_rfe)

accuracy = accuracy_score(y_TPC_6_test, y_TPC_6_pred)
print(f'Model Accuracy(TPC_6): {accuracy * 100:.2f}%')


# -------------------------------------------------
# TPC_7
X_TPC_7_train_scaled = scaler.fit_transform(X_TPC_7_train)
X_TPC_7_test_scaled = scaler.transform(X_TPC_7_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_TPC_7_train_rfe = rfe.fit_transform(X_TPC_7_train_scaled, y_TPC_7_train)
X_TPC_7_test_rfe = rfe.transform(X_TPC_7_test_scaled)

rf_classifier.fit(X_TPC_7_train_rfe, y_TPC_7_train)
y_TPC_7_pred = rf_classifier.predict(X_TPC_7_test_rfe)

accuracy = accuracy_score(y_TPC_7_test, y_TPC_7_pred)
print(f'Model Accuracy(TPC_7): {accuracy * 100:.2f}%')

Model Accuracy(AAC): 76.36%
Model Accuracy(DPC): 76.36%
Model Accuracy(TPC): 61.82%
Model Accuracy(combination): 54.55%
Model Accuracy(TPC_4): 58.18%
Model Accuracy(TPC_5): 63.64%
Model Accuracy(TPC_6): 74.55%
Model Accuracy(TPC_7): 69.09%


In [10]:
from scipy.stats import skew, kurtosis
from collections import Counter
from sklearn.preprocessing import StandardScaler

# 計算字符出現的概率
def calculate_probabilities(sequence):
    length = len(sequence)
    probabilities = {char: count / length for char, count in Counter(sequence).items()}
    return probabilities

# 計算熵
def calculate_entropy(sequence):
    probabilities = calculate_probabilities(sequence)
    entropy = -sum(p * (p and p != 1 and p != 0) * (p and p != 1 and p != 0) * (p and p != 1 and p != 0) for p in probabilities.values())
    return entropy

# 計算偏度
def calculate_skewness(sequence):
    frequencies = list(calculate_probabilities(sequence).values())
    skewness = skew(frequencies)
    return skewness

# 計算峰度
def calculate_kurtosis(sequence):
    frequencies = list(calculate_probabilities(sequence).values())
    kurt = kurtosis(frequencies)
    return kurt

X_train_with_stats = X_train.copy()
X_test_with_stats = X_test.copy()
X_DPC_train_with_stats = X_DPC_train.copy()
X_DPC_test_with_stats = X_DPC_test.copy()
X_TPC_train_with_stats = X_TPC_train.copy()
X_TPC_test_with_stats = X_TPC_test.copy()

# 對每個序列計算熵、偏度和峰度
X_train_with_stats['Entropy'] = bench_train['Sequence'].apply(calculate_entropy)
X_train_with_stats['Skewness'] = bench_train['Sequence'].apply(calculate_skewness)
X_train_with_stats['Kurtosis'] = bench_train['Sequence'].apply(calculate_kurtosis)
X_test_with_stats['Entropy'] = bench_test['Sequence'].apply(calculate_entropy)
X_test_with_stats['Skewness'] = bench_test['Sequence'].apply(calculate_skewness)
X_test_with_stats['Kurtosis'] = bench_test['Sequence'].apply(calculate_kurtosis)

X_DPC_train_with_stats['Entropy'] = bench_train['Sequence'].apply(calculate_entropy)
X_DPC_train_with_stats['Skewness'] = bench_train['Sequence'].apply(calculate_skewness)
X_DPC_train_with_stats['Kurtosis'] = bench_train['Sequence'].apply(calculate_kurtosis)
X_DPC_test_with_stats['Entropy'] = bench_test['Sequence'].apply(calculate_entropy)
X_DPC_test_with_stats['Skewness'] = bench_test['Sequence'].apply(calculate_skewness)
X_DPC_test_with_stats['Kurtosis'] = bench_test['Sequence'].apply(calculate_kurtosis)

X_TPC_train_with_stats['Entropy'] = bench_train['Sequence'].apply(calculate_entropy)
X_TPC_train_with_stats['Skewness'] = bench_train['Sequence'].apply(calculate_skewness)
X_TPC_train_with_stats['Kurtosis'] = bench_train['Sequence'].apply(calculate_kurtosis)
X_TPC_test_with_stats['Entropy'] = bench_test['Sequence'].apply(calculate_entropy)
X_TPC_test_with_stats['Skewness'] = bench_test['Sequence'].apply(calculate_skewness)
X_TPC_test_with_stats['Kurtosis'] = bench_test['Sequence'].apply(calculate_kurtosis)

# 如果熵、偏度和峰度為 NaN，則用平均值填充
X_train_with_stats['Entropy'] = X_train_with_stats['Entropy'].fillna(X_train_with_stats['Entropy'].mean())
X_train_with_stats['Skewness'] = X_train_with_stats['Skewness'].fillna(X_train_with_stats['Skewness'].mean())
X_train_with_stats['Kurtosis'] = X_train_with_stats['Kurtosis'].fillna(X_train_with_stats['Kurtosis'].mean())
X_test_with_stats['Entropy'] = X_test_with_stats['Entropy'].fillna(X_test_with_stats['Entropy'].mean())
X_test_with_stats['Skewness'] = X_test_with_stats['Skewness'].fillna(X_test_with_stats['Skewness'].mean())
X_test_with_stats['Kurtosis'] = X_test_with_stats['Kurtosis'].fillna(X_test_with_stats['Kurtosis'].mean())

X_DPC_train_with_stats['Entropy'] = X_DPC_train_with_stats['Entropy'].fillna(X_DPC_train_with_stats['Entropy'].mean())
X_DPC_train_with_stats['Skewness'] = X_DPC_train_with_stats['Skewness'].fillna(X_DPC_train_with_stats['Skewness'].mean())
X_DPC_train_with_stats['Kurtosis'] = X_DPC_train_with_stats['Kurtosis'].fillna(X_DPC_train_with_stats['Kurtosis'].mean())
X_DPC_test_with_stats['Entropy'] = X_DPC_test_with_stats['Entropy'].fillna(X_DPC_test_with_stats['Entropy'].mean())
X_DPC_test_with_stats['Skewness'] = X_DPC_test_with_stats['Skewness'].fillna(X_DPC_test_with_stats['Skewness'].mean())
X_DPC_test_with_stats['Kurtosis'] = X_DPC_test_with_stats['Kurtosis'].fillna(X_DPC_test_with_stats['Kurtosis'].mean())

X_TPC_train_with_stats['Entropy'] = X_TPC_train_with_stats['Entropy'].fillna(X_TPC_train_with_stats['Entropy'].mean())
X_TPC_train_with_stats['Skewness'] = X_TPC_train_with_stats['Skewness'].fillna(X_TPC_train_with_stats['Skewness'].mean())
X_TPC_train_with_stats['Kurtosis'] = X_TPC_train_with_stats['Kurtosis'].fillna(X_TPC_train_with_stats['Kurtosis'].mean())
X_TPC_test_with_stats['Entropy'] = X_TPC_test_with_stats['Entropy'].fillna(X_TPC_test_with_stats['Entropy'].mean())
X_TPC_test_with_stats['Skewness'] = X_TPC_test_with_stats['Skewness'].fillna(X_TPC_test_with_stats['Skewness'].mean())
X_TPC_test_with_stats['Kurtosis'] = X_TPC_test_with_stats['Kurtosis'].fillna(X_TPC_test_with_stats['Kurtosis'].mean())


# -------------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_with_stats)
X_test_scaled = scaler.transform(X_test_with_stats)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_scaled, y_train)

y_pred = rf_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy(AAC): {accuracy * 100:.2f}%')


# -------------------------------------------------
scaler = StandardScaler()
X_DPC_train_scaled = scaler.fit_transform(X_DPC_train_with_stats)
X_DPC_test_scaled = scaler.transform(X_DPC_test_with_stats)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_DPC_train_scaled, y_DPC_train)

y_DPC_pred = rf_classifier.predict(X_DPC_test_scaled)
accuracy = accuracy_score(y_DPC_test, y_DPC_pred)
print(f'Model Accuracy(DPC): {accuracy * 100:.2f}%')


# -------------------------------------------------
scaler = StandardScaler()
X_TPC_train_scaled = scaler.fit_transform(X_TPC_train_with_stats)
X_TPC_test_scaled = scaler.transform(X_TPC_test_with_stats)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_TPC_train_scaled, y_TPC_train)

y_TPC_pred = rf_classifier.predict(X_TPC_test_scaled)
accuracy = accuracy_score(y_TPC_test, y_TPC_pred)
print(f'Model Accuracy(TPC): {accuracy * 100:.2f}%')

Model Accuracy(AAC): 69.09%
Model Accuracy(DPC): 63.64%
Model Accuracy(TPC): 61.82%


In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import csv
from sklearn.svm import SVC
from itertools import product
from sklearn.metrics import accuracy_score, confusion_matrix

with open('NT15dataset_train.fasta', 'r') as file:
    lines = file.readlines()

# 初始化兩個空的列表，分別用於存儲標題（Header）和序列（Sequence）
headers = []
sequences = []

# 遍歷 FASTA 文件的每一行
for line in lines:
    line = line.strip()  # 去掉行尾的空白字符

    # 如果行以 '>AA' 開頭，則視為標題
    if line.startswith('>AA') or line.startswith('>neg'):
        headers.append(line)
        sequences.append('')
    else:
        sequences[-1] += line

# 將標題和序列轉換為 DataFrame
data = {'Header': headers, 'Sequence': sequences}
nt15_train = pd.DataFrame(data)

print(nt15_train)

print()
# 計算每種胺基酸的出現次數
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
count_matrix_train = np.zeros((len(sequences), len(amino_acids)))
percentage_matrix_train = []

for i, sequence in enumerate(sequences):
    for j, amino_acid in enumerate(amino_acids):
        count_matrix_train[i, j] = sequence.count(amino_acid)
    
    percentage_matrix_train.append(count_matrix_train[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_matrix_train = np.array(percentage_matrix_train)
print('percentage_train in 1-D:')
print(percentage_matrix_train)

print()
# 顯示比例矩陣
df_percentage_train = pd.DataFrame(percentage_matrix_train, columns=list(amino_acids))
print(df_percentage_train)

# 合併特徵資料和目標資料
df_combined_train = pd.concat([df_percentage_train, nt15_train['Header']], axis=1)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_train['Target'] = df_combined_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_train, y_train = df_combined_train.drop(['Header', 'Target'], axis=1), df_combined_train['Target']

#print(X_train)
#print(y_train)

#--------------------------------------------------------------
# 創建所有二個胺基酸相連的標籤
DPC = [a + b for a in amino_acids for b in amino_acids]

# 計算每對胺基酸的出現次數
count_DPC_train = np.zeros((len(sequences), len(DPC)))
percentage_DPC_train = []

for i, sequence in enumerate(sequences):
    for j, aa_pair in enumerate(DPC):
        count_DPC_train[i, j] = sequence.count(aa_pair)
        
    percentage_DPC_train.append(count_DPC_train[i]/15.)

# 計算每對胺基酸在序列中的比例
percentage_DPC_train = np.array(percentage_DPC_train)
print('percentage_DPC_train in 1-D:')
print(percentage_DPC_train)

# 顯示比例矩陣
df_percentage_DPC_train = pd.DataFrame(percentage_DPC_train, columns=DPC)
print(df_percentage_DPC_train)

# 合併特徵資料和目標資料
df_combined_DPC_train = pd.concat([df_percentage_train, df_percentage_DPC_train, nt15_train['Header']], axis=1)
#df_combined_DPC_train = pd.concat([df_percentage_DPC_train, nt15_train['Header']], axis=1)
print(df_combined_DPC_train)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_DPC_train['Target'] = df_combined_DPC_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_DPC_train, y_DPC_train = df_combined_DPC_train.drop(['Header', 'Target'], axis=1), df_combined_DPC_train['Target']
#print(X_DPC_train)

#--------------------------------------------------------------
# 產生所有三種胺基酸的排列組合
TPC = [''.join(comb) for comb in product(amino_acids, repeat=3)]

# 計算每種胺基酸的出現次數
count_TPC_train = np.zeros((len(sequences), len(TPC)))
percentage_TPC_train = []

for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_train[i, j] = sequence.count(amino_acid_combination)
        
    percentage_TPC_train.append(count_TPC_train[i]/15.)

# 計算每種胺基酸在序列中的比例
percentage_TPC_train = np.array(percentage_TPC_train)
print('percentage_TPC_train in 1-D:')
print(percentage_TPC_train)

# 顯示比例矩陣
df_percentage_TPC_train = pd.DataFrame(percentage_TPC_train, columns=TPC)
print(df_percentage_TPC_train)

# 合併特徵資料和目標資料
df_combined_TPC_train = pd.concat([df_percentage_train, df_percentage_DPC_train, df_percentage_TPC_train, nt15_train['Header']], axis=1)
print(df_combined_TPC_train)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_train['Target'] = df_combined_TPC_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_train, y_TPC_train = df_combined_TPC_train.drop(['Header', 'Target'], axis=1), df_combined_TPC_train['Target']


#--------------------------------------------------------------
# 將DPC抓到比較好的2個胺基酸加進DPC的feature

# 初始化 CN 和 CS 的特徵矩陣
combinations_CN = ['CN' + ''.join(comb) for comb in product(amino_acids, repeat=1)] + [''.join(comb) + 'CN' for comb in product(amino_acids, repeat=1)]
combinations_CS = ['CS' + ''.join(comb) for comb in product(amino_acids, repeat=1)] + [''.join(comb) + 'CS' for comb in product(amino_acids, repeat=1)]

all_combinations = combinations_CN + combinations_CS

count_matrix_total = np.zeros((len(sequences), len(all_combinations)))
percentage_matrix_total = []

for i, sequence in enumerate(sequences):
    for j, combo in enumerate(all_combinations):
        count_matrix_total[i, j] = sequence.count(combo)
    
    percentage_matrix_total.append(count_matrix_total[i] / len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_matrix_total = np.array(percentage_matrix_total)
print('percentage_combination_train in 1-D:')
print(percentage_matrix_total)

df_percentage_total = pd.DataFrame(percentage_matrix_total, columns=all_combinations)
print(df_percentage_total)

# 將 CN、CS 開頭的特徵和 DPC 特徵合併
df_combined_total = pd.concat([df_percentage_total, df_percentage_DPC_train, nt15_train['Header']], axis=1)
print(df_combined_total)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_total['Target'] = df_combined_total['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_combination_train, y_combination_train = df_combined_total.drop(['Header', 'Target'], axis=1), df_combined_total['Target']

# --------------------------------------------------------------
# 產生所有TPC穿插一個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_4_train = np.zeros((len(sequences), len(TPC)))
percentage_TPC_4_train = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_4_train[i, j] = 0
        for k in amino_acid :
            # 看起來很愚蠢但這是最不吃效能的方法
            count_TPC_4_train[i, j] += sequence.count( k + amino_acid_combination )
            count_TPC_4_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] )
            count_TPC_4_train[i, j] += sequence.count( amino_acid_combination[:1] + k + amino_acid_combination[2] )
            count_TPC_4_train[i, j] += sequence.count( amino_acid_combination + k )
    percentage_TPC_4_train.append(count_TPC_4_train[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_4_train = np.array(percentage_TPC_4_train)

# 顯示比例矩陣
df_percentage_TPC_4_train = pd.DataFrame(percentage_TPC_4_train, columns=TPC)
print('TPC_4:')
print(df_percentage_TPC_4_train)

# 合併特徵資料和目標資料
df_combined_TPC_4_train = pd.concat([df_percentage_train, df_percentage_DPC_train, df_percentage_TPC_4_train, nt15_train['Header']], axis=1)
print('AAC + DPC + TPC_4:')
print(df_combined_TPC_4_train)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_4_train['Target'] = df_combined_TPC_4_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_4_train, y_TPC_4_train = df_combined_TPC_4_train.drop(['Header', 'Target'], axis=1), df_combined_TPC_4_train['Target']



# --------------------------------------------------------------
# 產生所有TPC穿插兩個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_5_train = np.zeros((len(sequences), len(TPC)))
percentage_TPC_5_train = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_5_train[i, j] = 0
        for k in amino_acid :
            for l in amino_acid :
                count_TPC_5_train[i, j] += sequence.count( k + l + amino_acid_combination )
                count_TPC_5_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1:] )
                count_TPC_5_train[i, j] += sequence.count( k + amino_acid_combination[:1] + l + amino_acid_combination[2] )
                count_TPC_5_train[i, j] += sequence.count( k + amino_acid_combination + l )
                count_TPC_5_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1:] )
                count_TPC_5_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + amino_acid_combination[2:] )
                count_TPC_5_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] + l )
                count_TPC_5_train[i, j] += sequence.count( amino_acid_combination[0:1] + k + l + amino_acid_combination[2] )
                count_TPC_5_train[i, j] += sequence.count( amino_acid_combination[0:1] + k + amino_acid_combination[2] + l )
                count_TPC_5_train[i, j] += sequence.count( amino_acid_combination + k + l )
    percentage_TPC_5_train.append(count_TPC_5_train[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_5_train = np.array(percentage_TPC_5_train)

# 顯示比例矩陣
df_percentage_TPC_5_train = pd.DataFrame(percentage_TPC_5_train, columns=TPC)
print('TPC_5:')
print(df_percentage_TPC_5_train)

# 合併特徵資料和目標資料
df_combined_TPC_5_train = pd.concat([df_percentage_train, df_percentage_DPC_train, df_percentage_TPC_5_train, nt15_train['Header']], axis=1)
print('AAC + DPC + TPC_5:')
print(df_combined_TPC_5_train)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_5_train['Target'] = df_combined_TPC_5_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_5_train, y_TPC_5_train = df_combined_TPC_5_train.drop(['Header', 'Target'], axis=1), df_combined_TPC_5_train['Target']



# --------------------------------------------------------------
# 產生所有TPC穿插三個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_6_train = np.zeros((len(sequences), len(TPC)))
percentage_TPC_6_train = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_6_train[i, j] = 0
        for k in amino_acid :
            for l in amino_acid :
                for m in amino_acid :
                    count_TPC_6_train[i, j] += sequence.count( k + l + m + amino_acid_combination )
                    count_TPC_6_train[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + amino_acid_combination[1:] )
                    count_TPC_6_train[i, j] += sequence.count( k + l + amino_acid_combination[:1] + m + amino_acid_combination[2] )
                    count_TPC_6_train[i, j] += sequence.count( k + l + amino_acid_combination + m )
                    count_TPC_6_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + amino_acid_combination[1:] )
                    count_TPC_6_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1] + m + amino_acid_combination[2] )
                    count_TPC_6_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1:] + m )
                    count_TPC_6_train[i, j] += sequence.count( k + amino_acid_combination[0:1] + l + m + amino_acid_combination[2] )
                    count_TPC_6_train[i, j] += sequence.count( k + amino_acid_combination[0:1] + l + amino_acid_combination[2] + m )
                    count_TPC_6_train[i, j] += sequence.count( k + amino_acid_combination + l + m )

                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + amino_acid_combination[1:] )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1] + m + amino_acid_combination[2] )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1:] + m )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + m + amino_acid_combination[2] )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + amino_acid_combination[2] + m )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] + l + m )

                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[:1] + k + l + m + amino_acid_combination[2] )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[:1] + k + l + amino_acid_combination[2] + m )
                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination[:1] + k + amino_acid_combination[2] + l + m )

                    count_TPC_6_train[i, j] += sequence.count( amino_acid_combination + k + l + m )
    percentage_TPC_6_train.append(count_TPC_6_train[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_6_train = np.array(percentage_TPC_6_train)

# 顯示比例矩陣hg
df_percentage_TPC_6_train = pd.DataFrame(percentage_TPC_6_train, columns=TPC)
print('TPC_6:')
print(df_percentage_TPC_6_train)

# 合併特徵資料和目標資料
df_combined_TPC_6_train = pd.concat([df_percentage_train, df_percentage_DPC_train, df_percentage_TPC_6_train, nt15_train['Header']], axis=1)
print('AAC + DPC + TPC_6:')
print(df_combined_TPC_6_train)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_6_train['Target'] = df_combined_TPC_6_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_6_train, y_TPC_6_train = df_combined_TPC_6_train.drop(['Header', 'Target'], axis=1), df_combined_TPC_6_train['Target']


# --------------------------------------------------------------
# 產生所有TPC穿插三個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_7_train = np.zeros((len(sequences), len(TPC)))
percentage_TPC_7_train = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_7_train[i, j] = 0
        for k in amino_acid :
            for l in amino_acid :
                for m in amino_acid :
                    for n in amino_acid :
                        count_TPC_7_train[i, j] += sequence.count( k + l + m + n + amino_acid_combination )
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + n + amino_acid_combination[1:] )
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[:1] + k + l + m + n + amino_acid_combination[2] )
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination + k + l + m + n )
                        
                        count_TPC_7_train[i, j] += sequence.count( k + l + m + amino_acid_combination[0] + n + amino_acid_combination[1:])
                        count_TPC_7_train[i, j] += sequence.count( k + l + m + amino_acid_combination[:1] + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( k + l + m + amino_acid_combination + n )
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + n + amino_acid_combination[1:])
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[:1] + l + m + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination + l + m + n )
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + amino_acid_combination[1] + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + amino_acid_combination[1:] + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[:1] + k + l + m + amino_acid_combination[2] + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + m + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] + l + m + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[:1] + k + amino_acid_combination[2] + l + m + n)
                        
                        count_TPC_7_train[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + n + amino_acid_combination[1:])
                        count_TPC_7_train[i, j] += sequence.count( k + l + amino_acid_combination[:1] + m + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( k + l + amino_acid_combination + m + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1] + m + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1:] + m + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[:1] + k + l + amino_acid_combination[2] + m + n)
                        
                        count_TPC_7_train[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + amino_acid_combination[1] + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + amino_acid_combination[1:] + n)
                        count_TPC_7_train[i, j] += sequence.count( k + l + amino_acid_combination[:1] + m + amino_acid_combination[2] + n)
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1] + m + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1:] + m + n)
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[:1] + l + amino_acid_combination[2] + m + n)
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + amino_acid_combination[1] + n + amino_acid_combination[2])
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + amino_acid_combination[1:] + n)
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[:1] + l + m + amino_acid_combination[2] + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + amino_acid_combination[2] + m + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + m + amino_acid_combination[2] + n)
                        count_TPC_7_train[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1] + m + amino_acid_combination[2] + n)
                        
                        count_TPC_7_train[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1] + m + amino_acid_combination[2] + n)
                        
    percentage_TPC_7_train.append(count_TPC_7_train[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_7_train = np.array(percentage_TPC_7_train)

# 顯示比例矩陣hg
df_percentage_TPC_7_train = pd.DataFrame(percentage_TPC_7_train, columns=TPC)
print('TPC_7:')
print(df_percentage_TPC_7_train)

# 合併特徵資料和目標資料
df_combined_TPC_7_train = pd.concat([df_percentage_train, df_percentage_DPC_train, df_percentage_TPC_7_train, nt15_train['Header']], axis=1)
print('AAC + DPC + TPC_7:')
print(df_combined_TPC_7_train)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_7_train['Target'] = df_combined_TPC_7_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_7_train, y_TPC_7_train = df_combined_TPC_7_train.drop(['Header', 'Target'], axis=1), df_combined_TPC_7_train['Target']

      Header         Sequence
0      >AA26  EKYEGKISKTMSGLD
1      >AA27  ESLARPCAPGAPAEA
2      >AA28  FCNINNVCNFASRND
3      >AA29  FLKDHRISTFKNWPF
4      >AA30  FLSSRLQDLYSIVRR
..       ...              ...
155  >neg131  VVRLAREPGKRESRY
156  >neg132  YEDLRDESLKGLVDI
157  >neg133  YFLIQSVSSTVMLLN
158  >neg134  YGEPGMQLFVYGREE
159  >neg135  YNLSDTIKAFSILLL

[160 rows x 2 columns]

percentage_train in 1-D:
[[0.         0.         0.06666667 ... 0.         0.         0.06666667]
 [0.33333333 0.06666667 0.         ... 0.         0.         0.        ]
 [0.06666667 0.13333333 0.06666667 ... 0.06666667 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.13333333 0.         0.06666667]
 [0.         0.         0.         ... 0.06666667 0.         0.13333333]
 [0.06666667 0.         0.06666667 ... 0.         0.         0.06666667]]

            A         C         D         E         F         G         H  \
0    0.000000  0.000000  0.066667  0.133333  0.000000  0.133333  0.0

TPC_4:
     AAA  AAC  AAD  AAE  AAF  AAG  AAH  AAI  AAK  AAL  ...  YYM  YYN  YYP  \
0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
155  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
156  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
157  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
158  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
159  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

     YYQ  YYR  YYS  YYT  YYV  YYW  YYY  
0    0.0  0.0  0.0  0.0  0.

TPC_7:
     AAA  AAC  AAD  AAE  AAF  AAG  AAH  AAI  AAK  AAL  ...  YYM  YYN  YYP  \
0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
155  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
156  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
157  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
158  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
159  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

     YYQ  YYR  YYS  YYT  YYV  YYW  YYY  
0    0.0  0.0  0.0  0.0  0.

In [3]:
with open('NT15dataset_test.fasta', 'r') as file:
    lines = file.readlines()

# 初始化兩個空的列表，分別用於存儲標題（Header）和序列（Sequence）
headers = []
sequences = []

# 遍歷 FASTA 文件的每一行
for line in lines:
    line = line.strip()  # 去掉行尾的空白字符

    # 如果行以 '>AA' 開頭，則視為標題
    if line.startswith('>AA') or line.startswith('>neg'):
        headers.append(line)
        sequences.append('')
    else:
        sequences[-1] += line

# 將標題和序列轉換為 DataFrame
data = {'Header': headers, 'Sequence': sequences}
nt15_test = pd.DataFrame(data)

#print(nt15_test)

print()
# 計算每種胺基酸的出現次數
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
count_matrix_test = np.zeros((len(sequences), len(amino_acids)))
percentage_matrix_test = []

for i, sequence in enumerate(sequences):
    for j, amino_acid in enumerate(amino_acids):
        count_matrix_test[i, j] = sequence.count(amino_acid)
    
    percentage_matrix_test.append(count_matrix_test[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_matrix_test = np.array(percentage_matrix_test)
print('percentage_test in 1-D:')
print(percentage_matrix_test)

print()
# 顯示比例矩陣
df_percentage_test = pd.DataFrame(percentage_matrix_test, columns=list(amino_acids))
print(df_percentage_test)

# 合併特徵資料和目標資料
df_combined_test = pd.concat([df_percentage_test, nt15_test['Header']], axis=1)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_test['Target'] = df_combined_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_test, y_test = df_combined_test.drop(['Header', 'Target'], axis=1), df_combined_test['Target']

#print(X_test)
#print(y_test)

#--------------------------------------------------------------
# 創建所有二個胺基酸相連的標籤
DPC = [a + b for a in amino_acids for b in amino_acids]

# 計算每對胺基酸的出現次數
count_DPC_test = np.zeros((len(sequences), len(DPC)))
percentage_DPC_test = []

for i, sequence in enumerate(sequences):
    for j, aa_pair in enumerate(DPC):
        count_DPC_test[i, j] = sequence.count(aa_pair)
        
    percentage_DPC_test.append(count_DPC_test[i]/15.)

# 計算每對胺基酸在序列中的比例
percentage_DPC_test = np.array(percentage_DPC_test)
print('percentage_DPC_test in 1-D:')
print(percentage_DPC_test)

# 顯示比例矩陣
df_percentage_DPC_test = pd.DataFrame(percentage_DPC_test, columns=DPC)
print(df_percentage_DPC_test)

# 合併特徵資料和目標資料
df_combined_DPC_test = pd.concat([df_percentage_test, df_percentage_DPC_test, nt15_test['Header']], axis=1)
#df_combined_DPC_test = pd.concat([df_percentage_DPC_test, nt15_test['Header']], axis=1)
print(df_combined_DPC_test)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_DPC_test['Target'] = df_combined_DPC_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_DPC_test, y_DPC_test = df_combined_DPC_test.drop(['Header', 'Target'], axis=1), df_combined_DPC_test['Target']
#print(X_DPC_train)

#--------------------------------------------------------------
# 產生所有三種胺基酸的排列組合
TPC = [''.join(comb) for comb in product(amino_acids, repeat=3)]

# 計算每種胺基酸的出現次數
count_TPC_test = np.zeros((len(sequences), len(TPC)))
percentage_TPC_test = []

for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_test[i, j] = sequence.count(amino_acid_combination)
        
    percentage_TPC_test.append(count_TPC_test[i]/15.)

# 計算每種胺基酸在序列中的比例
percentage_TPC_test = np.array(percentage_TPC_test)
print('percentage_TPC_test in 1-D:')
print(percentage_TPC_test)

# 顯示比例矩陣
df_percentage_TPC_test = pd.DataFrame(percentage_TPC_test, columns=TPC)
print(df_percentage_TPC_test)

# 合併特徵資料和目標資料
df_combined_TPC_test = pd.concat([df_percentage_test, df_percentage_DPC_test, df_percentage_TPC_test, nt15_test['Header']], axis=1)
print(df_combined_TPC_test)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_test['Target'] = df_combined_TPC_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_test, y_TPC_test = df_combined_TPC_test.drop(['Header', 'Target'], axis=1), df_combined_TPC_test['Target']


#--------------------------------------------------------------
# 將DPC抓到比較好的2個胺基酸加進DPC的feature

# 初始化 CN 和 CS 的特徵矩陣
combinations_CN = ['CN' + ''.join(comb) for comb in product(amino_acids, repeat=1)] + [''.join(comb) + 'CN' for comb in product(amino_acids, repeat=1)]
combinations_CS = ['CS' + ''.join(comb) for comb in product(amino_acids, repeat=1)] + [''.join(comb) + 'CS' for comb in product(amino_acids, repeat=1)]

all_combinations = combinations_CN + combinations_CS

count_matrix_total = np.zeros((len(sequences), len(all_combinations)))
percentage_matrix_total = []

for i, sequence in enumerate(sequences):
    for j, combo in enumerate(all_combinations):
        count_matrix_total[i, j] = sequence.count(combo)
    
    percentage_matrix_total.append(count_matrix_total[i] / len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_matrix_total = np.array(percentage_matrix_total)
print('percentage_combination_test in 1-D:')
print(percentage_matrix_total)

df_percentage_total = pd.DataFrame(percentage_matrix_total, columns=all_combinations)
print(df_percentage_total)

# 將 CN、CS 開頭的特徵和 DPC 特徵合併
df_combined_total = pd.concat([df_percentage_total, df_percentage_DPC_test, nt15_test['Header']], axis=1)
print(df_combined_total)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_total['Target'] = df_combined_total['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_combination_test, y_combination_test = df_combined_total.drop(['Header', 'Target'], axis=1), df_combined_total['Target']

# --------------------------------------------------------------
# 產生所有TPC穿插一個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_4_test = np.zeros((len(sequences), len(TPC)))
percentage_TPC_4_test = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_4_test[i, j] = 0
        for k in amino_acid :
            # 看起來很愚蠢但這是最不吃效能的方法
            count_TPC_4_test[i, j] += sequence.count( k + amino_acid_combination )
            count_TPC_4_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] )
            count_TPC_4_test[i, j] += sequence.count( amino_acid_combination[:1] + k + amino_acid_combination[2] )
            count_TPC_4_test[i, j] += sequence.count( amino_acid_combination + k )
    percentage_TPC_4_test.append(count_TPC_4_test[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_4_test = np.array(percentage_TPC_4_test)

# 顯示比例矩陣
df_percentage_TPC_4_test = pd.DataFrame(percentage_TPC_4_test, columns=TPC)
print('TPC_4:')
print(df_percentage_TPC_4_test)

# 合併特徵資料和目標資料
df_combined_TPC_4_test = pd.concat([df_percentage_test, df_percentage_DPC_test, df_percentage_TPC_4_test, nt15_test['Header']], axis=1)
print('AAC + DPC + TPC_4:')
print(df_combined_TPC_4_test)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_4_test['Target'] = df_combined_TPC_4_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_4_test, y_TPC_4_test = df_combined_TPC_4_test.drop(['Header', 'Target'], axis=1), df_combined_TPC_4_test['Target']



# --------------------------------------------------------------
# 產生所有TPC穿插兩個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_5_test = np.zeros((len(sequences), len(TPC)))
percentage_TPC_5_test = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_5_test[i, j] = 0
        for k in amino_acid :
            for l in amino_acid :
                count_TPC_5_test[i, j] += sequence.count( k + l + amino_acid_combination )
                count_TPC_5_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1:] )
                count_TPC_5_test[i, j] += sequence.count( k + amino_acid_combination[:1] + l + amino_acid_combination[2] )
                count_TPC_5_test[i, j] += sequence.count( k + amino_acid_combination + l )
                count_TPC_5_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1:] )
                count_TPC_5_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + amino_acid_combination[2:] )
                count_TPC_5_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] + l )
                count_TPC_5_test[i, j] += sequence.count( amino_acid_combination[0:1] + k + l + amino_acid_combination[2] )
                count_TPC_5_test[i, j] += sequence.count( amino_acid_combination[0:1] + k + amino_acid_combination[2] + l )
                count_TPC_5_test[i, j] += sequence.count( amino_acid_combination + k + l )
    percentage_TPC_5_test.append(count_TPC_5_test[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_5_test = np.array(percentage_TPC_5_test)

# 顯示比例矩陣
df_percentage_TPC_5_test = pd.DataFrame(percentage_TPC_5_test, columns=TPC)
print('TPC_5:')
print(df_percentage_TPC_5_test)

# 合併特徵資料和目標資料
df_combined_TPC_5_test = pd.concat([df_percentage_test, df_percentage_DPC_test, df_percentage_TPC_5_test, nt15_test['Header']], axis=1)
print('AAC + DPC + TPC_5:')
print(df_combined_TPC_5_test)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_5_test['Target'] = df_combined_TPC_5_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_5_test, y_TPC_5_test = df_combined_TPC_5_test.drop(['Header', 'Target'], axis=1), df_combined_TPC_5_test['Target']



# --------------------------------------------------------------
# 產生所有TPC穿插三個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_6_test = np.zeros((len(sequences), len(TPC)))
percentage_TPC_6_test = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_6_test[i, j] = 0
        for k in amino_acid :
            for l in amino_acid :
                for m in amino_acid :
                    count_TPC_6_test[i, j] += sequence.count( k + l + m + amino_acid_combination )
                    count_TPC_6_test[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + amino_acid_combination[1:] )
                    count_TPC_6_test[i, j] += sequence.count( k + l + amino_acid_combination[:1] + m + amino_acid_combination[2] )
                    count_TPC_6_test[i, j] += sequence.count( k + l + amino_acid_combination + m )
                    count_TPC_6_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + amino_acid_combination[1:] )
                    count_TPC_6_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1] + m + amino_acid_combination[2] )
                    count_TPC_6_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1:] + m )
                    count_TPC_6_test[i, j] += sequence.count( k + amino_acid_combination[0:1] + l + m + amino_acid_combination[2] )
                    count_TPC_6_test[i, j] += sequence.count( k + amino_acid_combination[0:1] + l + amino_acid_combination[2] + m )
                    count_TPC_6_test[i, j] += sequence.count( k + amino_acid_combination + l + m )

                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + amino_acid_combination[1:] )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1] + m + amino_acid_combination[2] )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1:] + m )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + m + amino_acid_combination[2] )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + amino_acid_combination[2] + m )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] + l + m )

                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[:1] + k + l + m + amino_acid_combination[2] )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[:1] + k + l + amino_acid_combination[2] + m )
                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination[:1] + k + amino_acid_combination[2] + l + m )

                    count_TPC_6_test[i, j] += sequence.count( amino_acid_combination + k + l + m )
    percentage_TPC_6_test.append(count_TPC_6_test[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_6_test = np.array(percentage_TPC_6_test)

# 顯示比例矩陣hg
df_percentage_TPC_6_test = pd.DataFrame(percentage_TPC_6_test, columns=TPC)
print('TPC_6:')
print(df_percentage_TPC_6_test)

# 合併特徵資料和目標資料
df_combined_TPC_6_test = pd.concat([df_percentage_test, df_percentage_DPC_test, df_percentage_TPC_6_test, nt15_test['Header']], axis=1)
print('AAC + DPC + TPC_6:')
print(df_combined_TPC_6_test)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_6_test['Target'] = df_combined_TPC_6_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_6_test, y_TPC_6_test = df_combined_TPC_6_test.drop(['Header', 'Target'], axis=1), df_combined_TPC_6_test['Target']


# --------------------------------------------------------------
# 產生所有TPC穿插三個胺基酸的可能

# 計算每種胺基酸的出現次數
count_TPC_7_test = np.zeros((len(sequences), len(TPC)))
percentage_TPC_7_test = []
for i, sequence in enumerate(sequences):
    for j, amino_acid_combination in enumerate(TPC):
        count_TPC_7_test[i, j] = 0
        for k in amino_acid :
            for l in amino_acid :
                for m in amino_acid :
                    for n in amino_acid :
                        count_TPC_7_test[i, j] += sequence.count( k + l + m + n + amino_acid_combination )
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + n + amino_acid_combination[1:] )
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[:1] + k + l + m + n + amino_acid_combination[2] )
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination + k + l + m + n )
                        
                        count_TPC_7_test[i, j] += sequence.count( k + l + m + amino_acid_combination[0] + n + amino_acid_combination[1:])
                        count_TPC_7_test[i, j] += sequence.count( k + l + m + amino_acid_combination[:1] + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( k + l + m + amino_acid_combination + n )
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + n + amino_acid_combination[1:])
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[:1] + l + m + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination + l + m + n )
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + amino_acid_combination[1] + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + m + amino_acid_combination[1:] + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[:1] + k + l + m + amino_acid_combination[2] + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + m + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1:] + l + m + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[:1] + k + amino_acid_combination[2] + l + m + n)
                        
                        count_TPC_7_test[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + n + amino_acid_combination[1:])
                        count_TPC_7_test[i, j] += sequence.count( k + l + amino_acid_combination[:1] + m + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( k + l + amino_acid_combination + m + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1] + m + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1:] + m + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[:1] + k + l + amino_acid_combination[2] + m + n)
                        
                        count_TPC_7_test[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + amino_acid_combination[1] + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( k + l + amino_acid_combination[0] + m + amino_acid_combination[1:] + n)
                        count_TPC_7_test[i, j] += sequence.count( k + l + amino_acid_combination[:1] + m + amino_acid_combination[2] + n)
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1] + m + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1:] + m + n)
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[:1] + l + amino_acid_combination[2] + m + n)
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + amino_acid_combination[1] + n + amino_acid_combination[2])
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + m + amino_acid_combination[1:] + n)
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[:1] + l + m + amino_acid_combination[2] + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + amino_acid_combination[2] + m + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + amino_acid_combination[1] + l + m + amino_acid_combination[2] + n)
                        count_TPC_7_test[i, j] += sequence.count( amino_acid_combination[0] + k + l + amino_acid_combination[1] + m + amino_acid_combination[2] + n)
                        
                        count_TPC_7_test[i, j] += sequence.count( k + amino_acid_combination[0] + l + amino_acid_combination[1] + m + amino_acid_combination[2] + n)
                        
    percentage_TPC_7_test.append(count_TPC_7_test[i]/len(sequence))

# 計算每種胺基酸在序列中的比例
percentage_TPC_7_test = np.array(percentage_TPC_7_test)

# 顯示比例矩陣hg
df_percentage_TPC_7_test = pd.DataFrame(percentage_TPC_7_test, columns=TPC)
print('TPC_7:')
print(df_percentage_TPC_7_test)

# 合併特徵資料和目標資料
df_combined_TPC_7_test = pd.concat([df_percentage_test, df_percentage_DPC_test, df_percentage_TPC_7_test, nt15_test['Header']], axis=1)
print('AAC + DPC + TPC_7:')
print(df_combined_TPC_7_test)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_TPC_7_test['Target'] = df_combined_TPC_7_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_TPC_7_test, y_TPC_7_test = df_combined_TPC_7_test.drop(['Header', 'Target'], axis=1), df_combined_TPC_7_test['Target']


percentage_test in 1-D:
[[0.13333333 0.13333333 0.         0.06666667 0.06666667 0.13333333
  0.06666667 0.         0.         0.06666667 0.         0.
  0.06666667 0.13333333 0.06666667 0.         0.06666667 0.
  0.         0.        ]
 [0.06666667 0.         0.         0.         0.06666667 0.
  0.06666667 0.06666667 0.2        0.13333333 0.06666667 0.06666667
  0.         0.06666667 0.06666667 0.06666667 0.         0.06666667
  0.         0.        ]
 [0.33333333 0.         0.         0.06666667 0.         0.
  0.         0.         0.2        0.         0.         0.
  0.06666667 0.06666667 0.13333333 0.         0.06666667 0.06666667
  0.         0.        ]
 [0.2        0.13333333 0.         0.         0.         0.2
  0.         0.         0.         0.         0.         0.
  0.         0.         0.06666667 0.26666667 0.         0.06666667
  0.06666667 0.        ]
 [0.13333333 0.13333333 0.         0.06666667 0.06666667 0.13333333
  0.06666667 0.06666667 0.         0.         

    CNA  CNC  CND  CNE  CNF  CNG  CNH  CNI  CNK  CNL  ...   YN   YP   YQ   YR  \
0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
5   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
6   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
7   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
8   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
9   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
11  0.0  0.0  0.0  0.0  0.0 

TPC_5:
    AAA  AAC  AAD  AAE  AAF  AAG  AAH  AAI  AAK  AAL  ...  YYM  YYN  YYP  YYQ  \
0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
5   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
6   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
7   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
8   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
9   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
11  0.0  0.0  0.0  0.

TPC_6:
    AAA  AAC  AAD  AAE  AAF  AAG  AAH  AAI  AAK  AAL  ...  YYM  YYN  YYP  YYQ  \
0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
5   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
6   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
7   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
8   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
9   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
11  0.0  0.0  0.0  0.

TPC_7:
    AAA  AAC  AAD  AAE  AAF  AAG  AAH  AAI  AAK  AAL  ...  YYM  YYN  YYP  YYQ  \
0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
5   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
6   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
7   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
8   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
9   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
11  0.0  0.0  0.0  0.

In [52]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier()

# Single
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy(AAC): {accuracy * 100:.2f}%')


# DPC
rf_classifier.fit(X_DPC_train, y_DPC_train)
y_DPC_pred = rf_classifier.predict(X_DPC_test)

accuracy_DPC = accuracy_score(y_DPC_test, y_DPC_pred)
print(f'Model Accuracy(DPC): {accuracy_DPC * 100:.2f}%')


# TPC
rf_classifier.fit(X_combination_train, y_combination_train)
y_combination_pred = rf_classifier.predict(X_combination_test)

accuracy_combination = accuracy_score(y_combination_test, y_combination_pred)
print(f'Model Accuracy(combination): {accuracy_combination * 100:.2f}%')


# TPC
rf_classifier.fit(X_TPC_train, y_TPC_train)
y_TPC_pred = rf_classifier.predict(X_TPC_test)

accuracy_TPC = accuracy_score(y_TPC_test, y_TPC_pred)
print(f'Model Accuracy(TPC): {accuracy_TPC * 100:.2f}%')


# TPC_4
rf_classifier.fit(X_TPC_4_train, y_TPC_4_train)
y_TPC_4_pred = rf_classifier.predict(X_TPC_4_test)

accuracy_TPC_4 = accuracy_score(y_TPC_4_test, y_TPC_4_pred)
print(f'Model Accuracy(TPC_4): {accuracy_TPC_4 * 100:.2f}%')

# TPC_5
rf_classifier.fit(X_TPC_5_train, y_TPC_5_train)
y_TPC_5_pred = rf_classifier.predict(X_TPC_5_test)

accuracy_TPC_5 = accuracy_score(y_TPC_5_test, y_TPC_5_pred)
print(f'Model Accuracy(TPC_5): {accuracy_TPC_5 * 100:.2f}%')

# TPC_6
rf_classifier.fit(X_TPC_6_train, y_TPC_6_train)
y_TPC_6_pred = rf_classifier.predict(X_TPC_6_test)

accuracy_TPC_6 = accuracy_score(y_TPC_6_test, y_TPC_6_pred)
print(f'Model Accuracy(TPC_6): {accuracy_TPC_6 * 100:.2f}%')

# TPC_7
rf_classifier.fit(X_TPC_7_train, y_TPC_7_train)
y_TPC_7_pred = rf_classifier.predict(X_TPC_7_test)

accuracy_TPC_7 = accuracy_score(y_TPC_7_test, y_TPC_7_pred)
print(f'Model Accuracy(TPC_7): {accuracy_TPC_7 * 100:.2f}%')

Model Accuracy(AAC): 72.50%
Model Accuracy(DPC): 75.00%
Model Accuracy(combination): 67.50%
Model Accuracy(TPC): 70.00%
Model Accuracy(TPC_4): 67.50%
Model Accuracy(TPC_5): 67.50%
Model Accuracy(TPC_6): 72.50%
Model Accuracy(TPC_7): 72.50%


In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

rf_classifier = RandomForestClassifier()


# --------------------------------------------------------------
rf_classifier.fit(X_train, y_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_train, y_train)

X_train_selected = sfm.transform(X_train)
X_test_selected = sfm.transform(X_test)

selected_features = X_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("AAC Selected Features:")
# print(selected_features)

rf_classifier.fit(X_train_selected, y_train)

y_pred = rf_classifier.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy(AAC): {accuracy * 100:.2f}%')



# --------------------------------------------------------------
rf_classifier.fit(X_DPC_train, y_DPC_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_DPC_train, y_DPC_train)

X_DPC_train_selected = sfm.transform(X_DPC_train)
X_DPC_test_selected = sfm.transform(X_DPC_test)

selected_features = X_DPC_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("DPC Selected Features:")
# print(selected_features)

rf_classifier.fit(X_DPC_train_selected, y_DPC_train)

y_DPC_pred = rf_classifier.predict(X_DPC_test_selected)
accuracy_DPC = accuracy_score(y_DPC_test, y_DPC_pred)
print(f'Model Accuracy(DPC): {accuracy_DPC * 100:.2f}%')


# --------------------------------------------------------------
rf_classifier.fit(X_TPC_train, y_TPC_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_TPC_train, y_TPC_train)

X_TPC_train_selected = sfm.transform(X_TPC_train)
X_TPC_test_selected = sfm.transform(X_TPC_test)

selected_features = X_TPC_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("TPC Selected Features:")
# print(selected_features)

rf_classifier.fit(X_TPC_train_selected, y_TPC_train)

y_TPC_pred = rf_classifier.predict(X_TPC_test_selected)
accuracy_TPC = accuracy_score(y_TPC_test, y_TPC_pred)
print(f'Model Accuracy(TPC): {accuracy_TPC * 100:.2f}%')


# --------------------------------------------------------------
rf_classifier.fit(X_combination_train, y_combination_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_combination_train, y_combination_train)

X_combination_train_selected = sfm.transform(X_combination_train)
X_combination_test_selected = sfm.transform(X_combination_test)

selected_features = X_combination_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("combination Selected Features:")
# print(selected_features)

rf_classifier.fit(X_combination_train_selected, y_combination_train)

y_combination_pred = rf_classifier.predict(X_combination_test_selected)
accuracy_combination = accuracy_score(y_combination_test, y_combination_pred)
print(f'Model Accuracy(combination): {accuracy_combination * 100:.2f}%')

# --------------------------------------------------------------
# TPC_4
rf_classifier.fit(X_TPC_4_train, y_TPC_4_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_TPC_4_train, y_TPC_4_train)

X_TPC_4_train_selected = sfm.transform(X_TPC_4_train)
X_TPC_4_test_selected = sfm.transform(X_TPC_4_test)

selected_features = X_TPC_4_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("TPC_4 Selected Features:")
# print(selected_features)

rf_classifier.fit(X_TPC_4_train_selected, y_TPC_4_train)

y_TPC_4_pred = rf_classifier.predict(X_TPC_4_test_selected)
accuracy_TPC_4 = accuracy_score(y_TPC_4_test, y_TPC_4_pred)
print(f'Model Accuracy(TPC_4): {accuracy_TPC_4 * 100:.2f}%')


# --------------------------------------------------------------
# TPC_5
rf_classifier.fit(X_TPC_5_train, y_TPC_5_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_TPC_5_train, y_TPC_5_train)

X_TPC_5_train_selected = sfm.transform(X_TPC_5_train)
X_TPC_5_test_selected = sfm.transform(X_TPC_5_test)

selected_features = X_TPC_5_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("TPC_5 Selected Features:")
# print(selected_features)

rf_classifier.fit(X_TPC_5_train_selected, y_TPC_5_train)

y_TPC_5_pred = rf_classifier.predict(X_TPC_5_test_selected)
accuracy_TPC_5 = accuracy_score(y_TPC_5_test, y_TPC_5_pred)
print(f'Model Accuracy(TPC_5): {accuracy_TPC_5 * 100:.2f}%')


# --------------------------------------------------------------
# TPC_6
rf_classifier.fit(X_TPC_6_train, y_TPC_6_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_TPC_6_train, y_TPC_6_train)

X_TPC_6_train_selected = sfm.transform(X_TPC_6_train)
X_TPC_6_test_selected = sfm.transform(X_TPC_6_test)

selected_features = X_TPC_6_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("TPC_6 Selected Features:")
# print(selected_features)

rf_classifier.fit(X_TPC_6_train_selected, y_TPC_6_train)

y_TPC_6_pred = rf_classifier.predict(X_TPC_6_test_selected)
accuracy_TPC_6 = accuracy_score(y_TPC_6_test, y_TPC_6_pred)
print(f'Model Accuracy(TPC_6): {accuracy_TPC_6 * 100:.2f}%')


# --------------------------------------------------------------
# TPC_7
rf_classifier.fit(X_TPC_7_train, y_TPC_7_train)
#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_TPC_7_train, y_TPC_7_train)

X_TPC_7_train_selected = sfm.transform(X_TPC_7_train)
X_TPC_7_test_selected = sfm.transform(X_TPC_7_test)

selected_features = X_TPC_7_train.columns[sfm.get_support()]

# 印出篩選後的特徵
# print("TPC_7 Selected Features:")
# print(selected_features)

rf_classifier.fit(X_TPC_7_train_selected, y_TPC_7_train)

y_TPC_7_pred = rf_classifier.predict(X_TPC_7_test_selected)
accuracy_TPC_7 = accuracy_score(y_TPC_7_test, y_TPC_7_pred)
print(f'Model Accuracy(TPC_7): {accuracy_TPC_7 * 100:.2f}%')

Model Accuracy(AAC): 70.00%
Model Accuracy(DPC): 80.00%
Model Accuracy(TPC): 72.50%
Model Accuracy(combination): 62.50%
Model Accuracy(TPC_4): 70.00%
Model Accuracy(TPC_5): 80.00%
Model Accuracy(TPC_6): 75.00%
Model Accuracy(TPC_7): 67.50%


In [252]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
import time

# 定義參數範圍
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

rf_classifier = RandomForestClassifier()
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')

start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()
execution_time = end_time - start_time
print(f"Grid Search Execution Time: {execution_time:.2f} seconds")

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_model = grid_search.best_estimator_

#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_train, y_train)
selected_features = X_train.columns[sfm.get_support()]

X_train_selected = sfm.transform(X_train)
X_test_selected = sfm.transform(X_test)

# 印出篩選後的特徵
print("Selected Features:", selected_features)

# 將模型擬合到篩選後的訓練數據上
best_model.fit(X_train_selected, y_train)

y_pred = best_model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy(AAC): {accuracy * 100:.2f}%')

print('-------------------------------------------------')
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')

start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()
execution_time = end_time - start_time
print(f"Grid Search Execution Time: {execution_time:.2f} seconds")

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_model = grid_search.best_estimator_

#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_DPC_train, y_DPC_train)
selected_features = X_DPC_train.columns[sfm.get_support()]

X_DPC_train_selected = sfm.transform(X_DPC_train)
X_DPC_test_selected = sfm.transform(X_DPC_test)

# 印出篩選後的特徵
print("DPC Selected Features:", selected_features)

best_model.fit(X_DPC_train_selected, y_DPC_train)

y_DPC_pred = best_model.predict(X_DPC_test_selected)
accuracy_DPC = accuracy_score(y_DPC_test, y_DPC_pred)
print(f'Model Accuracy(DPC): {accuracy_DPC * 100:.2f}%')

print('-------------------------------------------------')
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')

start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()
execution_time = end_time - start_time
print(f"Grid Search Execution Time: {execution_time:.2f} seconds")

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_model = grid_search.best_estimator_

#feature_importance = rf_classifier.feature_importances_
sfm = SelectFromModel(rf_classifier, threshold=0.01)  # 可以調整閾值
sfm.fit(X_TPC_train, y_TPC_train)
selected_features = X_TPC_train.columns[sfm.get_support()]

X_TPC_train_selected = sfm.transform(X_TPC_train)
X_TPC_test_selected = sfm.transform(X_TPC_test)

# 印出篩選後的特徵
print("TPC Selected Features:", selected_features)

best_model.fit(X_TPC_train_selected, y_TPC_train)

y_TPC_pred = best_model.predict(X_TPC_test_selected)
accuracy_TPC = accuracy_score(y_TPC_test, y_TPC_pred)
print(f'Model Accuracy(TPC): {accuracy_TPC * 100:.2f}%')

Grid Search Execution Time: 125.87 seconds
Best Parameters: {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Selected Features: Index(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
       'R', 'S', 'T', 'V', 'W', 'Y'],
      dtype='object')
Model Accuracy(AAC): 75.00%
-------------------------------------------------
Grid Search Execution Time: 126.73 seconds
Best Parameters: {'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
DPC Selected Features: Index(['C', 'F', 'G', 'I', 'L', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'CG',
       'CN', 'CS', 'DA', 'GV', 'LI', 'PC', 'PW', 'QD', 'SP'],
      dtype='object')
Model Accuracy(DPC): 72.50%
-------------------------------------------------
Grid Search Execution Time: 138.14 seconds
Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimator

In [260]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# -------------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

pca = PCA(n_components=0.6)

X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_pca, y_train)

y_pred = rf_classifier.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy(AAC): {accuracy * 100:.2f}%')


# -------------------------------------------------
scaler = StandardScaler()
X_DPC_train_scaled = scaler.fit_transform(X_DPC_train)
X_DPC_test_scaled = scaler.transform(X_DPC_test)

pca = PCA(n_components=0.6)

X_DPC_train_pca = pca.fit_transform(X_DPC_train_scaled)
X_DPC_test_pca = pca.transform(X_DPC_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_DPC_train_pca, y_DPC_train)

y_DPC_pred = rf_classifier.predict(X_DPC_test_pca)
accuracy = accuracy_score(y_DPC_test, y_DPC_pred)
print(f'Model Accuracy(DPC): {accuracy * 100:.2f}%')


# -------------------------------------------------
scaler = StandardScaler()
X_TPC_train_scaled = scaler.fit_transform(X_TPC_train)
X_TPC_test_scaled = scaler.transform(X_TPC_test)

pca = PCA(n_components=0.6)

X_TPC_train_pca = pca.fit_transform(X_TPC_train_scaled)
X_TPC_test_pca = pca.transform(X_TPC_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_TPC_train_pca, y_TPC_train)

y_TPC_pred = rf_classifier.predict(X_TPC_test_pca)
accuracy = accuracy_score(y_TPC_test, y_TPC_pred)
print(f'Model Accuracy(TPC): {accuracy * 100:.2f}%')


# -------------------------------------------------
scaler = StandardScaler()
X_combination_train_scaled = scaler.fit_transform(X_combination_train)
X_combination_test_scaled = scaler.transform(X_combination_test)

pca = PCA(n_components=0.6)

X_combination_train_pca = pca.fit_transform(X_combination_train_scaled)
X_combination_test_pca = pca.transform(X_combination_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_combination_train_pca, y_combination_train)

y_combination_pred = rf_classifier.predict(X_combination_test_pca)
accuracy = accuracy_score(y_combination_test, y_combination_pred)
print(f'Model Accuracy(combination): {accuracy * 100:.2f}%')

# -------------------------------------------------
# TPC_4
scaler = StandardScaler()
X_TPC_4_train_scaled = scaler.fit_transform(X_TPC_4_train)
X_TPC_4_test_scaled = scaler.transform(X_TPC_4_test)

pca = PCA(n_components=0.6)

X_TPC_4_train_pca = pca.fit_transform(X_TPC_4_train_scaled)
X_TPC_4_test_pca = pca.transform(X_TPC_4_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_TPC_4_train_pca, y_TPC_4_train)

y_TPC_4_pred = rf_classifier.predict(X_TPC_4_test_pca)
accuracy = accuracy_score(y_TPC_4_test, y_TPC_4_pred)
print(f'Model Accuracy(TPC_4): {accuracy * 100:.2f}%')

# -------------------------------------------------
# TPC_5
scaler = StandardScaler()
X_TPC_5_train_scaled = scaler.fit_transform(X_TPC_5_train)
X_TPC_5_test_scaled = scaler.transform(X_TPC_5_test)

pca = PCA(n_components=0.6)

X_TPC_5_train_pca = pca.fit_transform(X_TPC_5_train_scaled)
X_TPC_5_test_pca = pca.transform(X_TPC_5_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_TPC_5_train_pca, y_TPC_5_train)

y_TPC_5_pred = rf_classifier.predict(X_TPC_5_test_pca)
accuracy = accuracy_score(y_TPC_5_test, y_TPC_5_pred)
print(f'Model Accuracy(TPC_5): {accuracy * 100:.2f}%')


# -------------------------------------------------
# TPC_6
scaler = StandardScaler()
X_TPC_6_train_scaled = scaler.fit_transform(X_TPC_6_train)
X_TPC_6_test_scaled = scaler.transform(X_TPC_6_test)

pca = PCA(n_components=0.6)

X_TPC_6_train_pca = pca.fit_transform(X_TPC_6_train_scaled)
X_TPC_6_test_pca = pca.transform(X_TPC_6_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_TPC_6_train_pca, y_TPC_6_train)

y_TPC_6_pred = rf_classifier.predict(X_TPC_6_test_pca)
accuracy = accuracy_score(y_TPC_6_test, y_TPC_6_pred)
print(f'Model Accuracy(TPC_6): {accuracy * 100:.2f}%')


# -------------------------------------------------
# TPC_7
scaler = StandardScaler()
X_TPC_7_train_scaled = scaler.fit_transform(X_TPC_7_train)
X_TPC_7_test_scaled = scaler.transform(X_TPC_7_test)

pca = PCA(n_components=0.6)

X_TPC_7_train_pca = pca.fit_transform(X_TPC_7_train_scaled)
X_TPC_7_test_pca = pca.transform(X_TPC_7_test_scaled)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_TPC_7_train_pca, y_TPC_7_train)

y_TPC_7_pred = rf_classifier.predict(X_TPC_7_test_pca)
accuracy = accuracy_score(y_TPC_7_test, y_TPC_7_pred)
print(f'Model Accuracy(TPC_7): {accuracy * 100:.2f}%')

Model Accuracy(AAC): 60.00%
Model Accuracy(DPC): 57.50%
Model Accuracy(TPC): 45.00%
Model Accuracy(combination): 60.00%
Model Accuracy(TPC_4): 70.00%
Model Accuracy(TPC_5): 57.50%
Model Accuracy(TPC_6): 42.50%
Model Accuracy(TPC_7): 50.00%


In [261]:
from sklearn.feature_selection import RFE

scaler = StandardScaler()

rf_classifier = RandomForestClassifier()


# -------------------------------------------------
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_train_rfe = rfe.fit_transform(X_train_scaled, y_train)
X_test_rfe = rfe.transform(X_test_scaled)

rf_classifier.fit(X_train_rfe, y_train)
y_pred = rf_classifier.predict(X_test_rfe)

accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy(AAC): {accuracy * 100:.2f}%')


# -------------------------------------------------
X_DPC_train_scaled = scaler.fit_transform(X_DPC_train)
X_DPC_test_scaled = scaler.transform(X_DPC_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_DPC_train_rfe = rfe.fit_transform(X_DPC_train_scaled, y_DPC_train)
X_DPC_test_rfe = rfe.transform(X_DPC_test_scaled)

rf_classifier.fit(X_DPC_train_rfe, y_DPC_train)
y_DPC_pred = rf_classifier.predict(X_DPC_test_rfe)

accuracy = accuracy_score(y_DPC_test, y_DPC_pred)
print(f'Model Accuracy(DPC): {accuracy * 100:.2f}%')


# -------------------------------------------------
X_TPC_train_scaled = scaler.fit_transform(X_TPC_train)
X_TPC_test_scaled = scaler.transform(X_TPC_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_TPC_train_rfe = rfe.fit_transform(X_TPC_train_scaled, y_TPC_train)
X_TPC_test_rfe = rfe.transform(X_TPC_test_scaled)

rf_classifier.fit(X_TPC_train_rfe, y_TPC_train)
y_TPC_pred = rf_classifier.predict(X_TPC_test_rfe)

accuracy = accuracy_score(y_TPC_test, y_TPC_pred)
print(f'Model Accuracy(TPC): {accuracy * 100:.2f}%')


# -------------------------------------------------
X_combination_train_scaled = scaler.fit_transform(X_combination_train)
X_combination_test_scaled = scaler.transform(X_combination_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_combination_train_rfe = rfe.fit_transform(X_combination_train_scaled, y_combination_train)
X_combination_test_rfe = rfe.transform(X_combination_test_scaled)

rf_classifier.fit(X_combination_train_rfe, y_combination_train)
y_combination_pred = rf_classifier.predict(X_combination_test_rfe)

accuracy = accuracy_score(y_combination_test, y_combination_pred)
print(f'Model Accuracy(combination): {accuracy * 100:.2f}%')


# -------------------------------------------------
# TPC_4
X_TPC_4_train_scaled = scaler.fit_transform(X_TPC_4_train)
X_TPC_4_test_scaled = scaler.transform(X_TPC_4_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_TPC_4_train_rfe = rfe.fit_transform(X_TPC_4_train_scaled, y_TPC_4_train)
X_TPC_4_test_rfe = rfe.transform(X_TPC_4_test_scaled)

rf_classifier.fit(X_TPC_4_train_rfe, y_TPC_4_train)
y_TPC_4_pred = rf_classifier.predict(X_TPC_4_test_rfe)

accuracy = accuracy_score(y_TPC_4_test, y_TPC_4_pred)
print(f'Model Accuracy(TPC_4): {accuracy * 100:.2f}%')

# -------------------------------------------------
# TPC_5
X_TPC_5_train_scaled = scaler.fit_transform(X_TPC_5_train)
X_TPC_5_test_scaled = scaler.transform(X_TPC_5_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_TPC_5_train_rfe = rfe.fit_transform(X_TPC_5_train_scaled, y_TPC_5_train)
X_TPC_5_test_rfe = rfe.transform(X_TPC_5_test_scaled)

rf_classifier.fit(X_TPC_5_train_rfe, y_TPC_5_train)
y_TPC_5_pred = rf_classifier.predict(X_TPC_5_test_rfe)

accuracy = accuracy_score(y_TPC_5_test, y_TPC_5_pred)
print(f'Model Accuracy(TPC_5): {accuracy * 100:.2f}%')

# -------------------------------------------------
# TPC_6
X_TPC_6_train_scaled = scaler.fit_transform(X_TPC_6_train)
X_TPC_6_test_scaled = scaler.transform(X_TPC_6_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_TPC_6_train_rfe = rfe.fit_transform(X_TPC_6_train_scaled, y_TPC_6_train)
X_TPC_6_test_rfe = rfe.transform(X_TPC_6_test_scaled)

rf_classifier.fit(X_TPC_6_train_rfe, y_TPC_6_train)
y_TPC_6_pred = rf_classifier.predict(X_TPC_6_test_rfe)

accuracy = accuracy_score(y_TPC_6_test, y_TPC_6_pred)
print(f'Model Accuracy(TPC_6): {accuracy * 100:.2f}%')

# -------------------------------------------------
# TPC_7
X_TPC_7_train_scaled = scaler.fit_transform(X_TPC_7_train)
X_TPC_7_test_scaled = scaler.transform(X_TPC_7_test)

rfe = RFE(estimator=rf_classifier, n_features_to_select=15)
X_TPC_7_train_rfe = rfe.fit_transform(X_TPC_7_train_scaled, y_TPC_7_train)
X_TPC_7_test_rfe = rfe.transform(X_TPC_7_test_scaled)

rf_classifier.fit(X_TPC_7_train_rfe, y_TPC_7_train)
y_TPC_7_pred = rf_classifier.predict(X_TPC_7_test_rfe)

accuracy = accuracy_score(y_TPC_7_test, y_TPC_7_pred)
print(f'Model Accuracy(TPC_7): {accuracy * 100:.2f}%')

Model Accuracy(AAC): 80.00%
Model Accuracy(DPC): 72.50%
Model Accuracy(TPC): 55.00%
Model Accuracy(combination): 62.50%
Model Accuracy(TPC_4): 72.50%
Model Accuracy(TPC_5): 72.50%
Model Accuracy(TPC_6): 65.00%
Model Accuracy(TPC_7): 70.00%


In [139]:
from scipy.stats import skew, kurtosis
from collections import Counter
from sklearn.preprocessing import StandardScaler

# 計算字符出現的概率
def calculate_probabilities(sequence):
    length = len(sequence)
    probabilities = {char: count / length for char, count in Counter(sequence).items()}
    return probabilities

# 計算熵
def calculate_entropy(sequence):
    probabilities = calculate_probabilities(sequence)
    entropy = -sum(p * (p and p != 1 and p != 0) * (p and p != 1 and p != 0) * (p and p != 1 and p != 0) for p in probabilities.values())
    return entropy

# 計算偏度
def calculate_skewness(sequence):
    frequencies = list(calculate_probabilities(sequence).values())
    skewness = skew(frequencies)
    return skewness

# 計算峰度
def calculate_kurtosis(sequence):
    frequencies = list(calculate_probabilities(sequence).values())
    kurt = kurtosis(frequencies)
    return kurt

X_train_with_stats = X_train.copy()
X_test_with_stats = X_test.copy()
X_DPC_train_with_stats = X_DPC_train.copy()
X_DPC_test_with_stats = X_DPC_test.copy()
X_TPC_train_with_stats = X_TPC_train.copy()
X_TPC_test_with_stats = X_TPC_test.copy()

# 對每個序列計算熵、偏度和峰度
X_train_with_stats['Entropy'] = nt15_train['Sequence'].apply(calculate_entropy)
X_train_with_stats['Skewness'] = nt15_train['Sequence'].apply(calculate_skewness)
X_train_with_stats['Kurtosis'] = nt15_train['Sequence'].apply(calculate_kurtosis)
X_test_with_stats['Entropy'] = nt15_test['Sequence'].apply(calculate_entropy)
X_test_with_stats['Skewness'] = nt15_test['Sequence'].apply(calculate_skewness)
X_test_with_stats['Kurtosis'] = nt15_test['Sequence'].apply(calculate_kurtosis)

X_DPC_train_with_stats['Entropy'] = nt15_train['Sequence'].apply(calculate_entropy)
X_DPC_train_with_stats['Skewness'] = nt15_train['Sequence'].apply(calculate_skewness)
X_DPC_train_with_stats['Kurtosis'] = nt15_train['Sequence'].apply(calculate_kurtosis)
X_DPC_test_with_stats['Entropy'] = nt15_test['Sequence'].apply(calculate_entropy)
X_DPC_test_with_stats['Skewness'] = nt15_test['Sequence'].apply(calculate_skewness)
X_DPC_test_with_stats['Kurtosis'] = nt15_test['Sequence'].apply(calculate_kurtosis)

X_TPC_train_with_stats['Entropy'] = nt15_train['Sequence'].apply(calculate_entropy)
X_TPC_train_with_stats['Skewness'] = nt15_train['Sequence'].apply(calculate_skewness)
X_TPC_train_with_stats['Kurtosis'] = nt15_train['Sequence'].apply(calculate_kurtosis)
X_TPC_test_with_stats['Entropy'] = nt15_test['Sequence'].apply(calculate_entropy)
X_TPC_test_with_stats['Skewness'] = nt15_test['Sequence'].apply(calculate_skewness)
X_TPC_test_with_stats['Kurtosis'] = nt15_test['Sequence'].apply(calculate_kurtosis)

# 如果熵、偏度和峰度為 NaN，則用平均值填充
X_train_with_stats['Entropy'] = X_train_with_stats['Entropy'].fillna(X_train_with_stats['Entropy'].mean())
X_train_with_stats['Skewness'] = X_train_with_stats['Skewness'].fillna(X_train_with_stats['Skewness'].mean())
X_train_with_stats['Kurtosis'] = X_train_with_stats['Kurtosis'].fillna(X_train_with_stats['Kurtosis'].mean())
X_test_with_stats['Entropy'] = X_test_with_stats['Entropy'].fillna(X_test_with_stats['Entropy'].mean())
X_test_with_stats['Skewness'] = X_test_with_stats['Skewness'].fillna(X_test_with_stats['Skewness'].mean())
X_test_with_stats['Kurtosis'] = X_test_with_stats['Kurtosis'].fillna(X_test_with_stats['Kurtosis'].mean())

X_DPC_train_with_stats['Entropy'] = X_DPC_train_with_stats['Entropy'].fillna(X_DPC_train_with_stats['Entropy'].mean())
X_DPC_train_with_stats['Skewness'] = X_DPC_train_with_stats['Skewness'].fillna(X_DPC_train_with_stats['Skewness'].mean())
X_DPC_train_with_stats['Kurtosis'] = X_DPC_train_with_stats['Kurtosis'].fillna(X_DPC_train_with_stats['Kurtosis'].mean())
X_DPC_test_with_stats['Entropy'] = X_DPC_test_with_stats['Entropy'].fillna(X_DPC_test_with_stats['Entropy'].mean())
X_DPC_test_with_stats['Skewness'] = X_DPC_test_with_stats['Skewness'].fillna(X_DPC_test_with_stats['Skewness'].mean())
X_DPC_test_with_stats['Kurtosis'] = X_DPC_test_with_stats['Kurtosis'].fillna(X_DPC_test_with_stats['Kurtosis'].mean())

X_TPC_train_with_stats['Entropy'] = X_TPC_train_with_stats['Entropy'].fillna(X_TPC_train_with_stats['Entropy'].mean())
X_TPC_train_with_stats['Skewness'] = X_TPC_train_with_stats['Skewness'].fillna(X_TPC_train_with_stats['Skewness'].mean())
X_TPC_train_with_stats['Kurtosis'] = X_TPC_train_with_stats['Kurtosis'].fillna(X_TPC_train_with_stats['Kurtosis'].mean())
X_TPC_test_with_stats['Entropy'] = X_TPC_test_with_stats['Entropy'].fillna(X_TPC_test_with_stats['Entropy'].mean())
X_TPC_test_with_stats['Skewness'] = X_TPC_test_with_stats['Skewness'].fillna(X_TPC_test_with_stats['Skewness'].mean())
X_TPC_test_with_stats['Kurtosis'] = X_TPC_test_with_stats['Kurtosis'].fillna(X_TPC_test_with_stats['Kurtosis'].mean())


# -------------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_with_stats)
X_test_scaled = scaler.transform(X_test_with_stats)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_scaled, y_train)

y_pred = rf_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy(AAC): {accuracy * 100:.2f}%')


# -------------------------------------------------
scaler = StandardScaler()
X_DPC_train_scaled = scaler.fit_transform(X_DPC_train_with_stats)
X_DPC_test_scaled = scaler.transform(X_DPC_test_with_stats)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_DPC_train_scaled, y_DPC_train)

y_DPC_pred = rf_classifier.predict(X_DPC_test_scaled)
accuracy = accuracy_score(y_DPC_test, y_DPC_pred)
print(f'Model Accuracy(DPC): {accuracy * 100:.2f}%')


# -------------------------------------------------
scaler = StandardScaler()
X_TPC_train_scaled = scaler.fit_transform(X_TPC_train_with_stats)
X_TPC_test_scaled = scaler.transform(X_TPC_test_with_stats)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_TPC_train_scaled, y_TPC_train)

y_TPC_pred = rf_classifier.predict(X_TPC_test_scaled)
accuracy = accuracy_score(y_TPC_test, y_TPC_pred)
print(f'Model Accuracy(TPC): {accuracy * 100:.2f}%')

Model Accuracy(AAC): 72.50%
Model Accuracy(DPC): 65.00%
Model Accuracy(TPC): 70.00%


In [143]:
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
combinations_CN = ['CN' + ''.join(comb) for comb in product(amino_acids, repeat=1)]
combinations_CS = ['CS' + ''.join(comb) for comb in product(amino_acids, repeat=1)]

all_combinations = combinations_CN + combinations_CS

print(all_combinations)

['CNA', 'CNC', 'CND', 'CNE', 'CNF', 'CNG', 'CNH', 'CNI', 'CNK', 'CNL', 'CNM', 'CNN', 'CNP', 'CNQ', 'CNR', 'CNS', 'CNT', 'CNV', 'CNW', 'CNY', 'CSA', 'CSC', 'CSD', 'CSE', 'CSF', 'CSG', 'CSH', 'CSI', 'CSK', 'CSL', 'CSM', 'CSN', 'CSP', 'CSQ', 'CSR', 'CSS', 'CST', 'CSV', 'CSW', 'CSY']
