In [75]:
%matplotlib inline
import numpy as np
import pandas as pd
import csv
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

with open('benchmarkdataset_train.fasta', 'r') as file:
    lines = file.readlines()

# 初始化兩個空的列表，分別用於存儲標題（Header）和序列（Sequence）
headers = []
sequences = []

# 遍歷 FASTA 文件的每一行
for line in lines:
    line = line.strip()  # 去掉行尾的空白字符

    # 如果行以 '>AA' 開頭，則視為標題
    if line.startswith('>AA') or line.startswith('>neg'):
        headers.append(line)
        sequences.append('')
    else:
        sequences[-1] += line

# 將標題和序列轉換為 DataFrame
data = {'Header': headers, 'Sequence': sequences}
bench_train = pd.DataFrame(data)

print(bench_train)

print()
# 計算每種胺基酸的出現次數
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
count_matrix_train = np.zeros((len(sequences), len(amino_acids)))

for i, sequence in enumerate(sequences):
    for j, amino_acid in enumerate(amino_acids):
        count_matrix_train[i, j] = sequence.count(amino_acid)

# 計算每種胺基酸在序列中的比例
percentage_matrix_train = count_matrix_train / len(sequences[0])
print('percentage_train in 1-D:')
print(percentage_matrix_train)

print()
# 顯示比例矩陣
df_percentage_train = pd.DataFrame(percentage_matrix_train, columns=list(amino_acids))
print(df_percentage_train)

# 合併特徵資料和目標資料
df_combined_train = pd.concat([df_percentage_train, bench_train['Header']], axis=1)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_train['Target'] = df_combined_train['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_train, y_train = df_combined_train.drop(['Header', 'Target'], axis=1), df_combined_train['Target']

#print(X_train)
#print(y_train)
#print(y_train.value_counts())

      Header                Sequence
0      >AA29         FLKDHRISTFKNWPF
1      >AA30    FLSSRLQDLYSIVRRADRAA
2      >AA31            GDVIDTDRDIDR
3      >AA32          GFHDHGPCDPPSHK
4      >AA33       GHRATSDLASTGEESQD
..       ...                     ...
208  >neg131       VVRLAREPGKRESRYMH
209  >neg132       YEDLRDESLKGLVDIGF
210  >neg133  YFLIQSVSSTVMLLNGLYIFVN
211  >neg134         YGEPGMQLFVYGREE
212  >neg135    YNLSDTIKAFSILLLTDLCI

[213 rows x 2 columns]

percentage_train in 1-D:
[[0.         0.         0.06666667 ... 0.         0.06666667 0.        ]
 [0.2        0.         0.13333333 ... 0.06666667 0.         0.06666667]
 [0.         0.         0.33333333 ... 0.06666667 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.2        0.         0.13333333]
 [0.         0.         0.         ... 0.06666667 0.         0.13333333]
 [0.06666667 0.06666667 0.13333333 ... 0.         0.         0.06666667]]

            A         C         D         E         F        

In [76]:
with open('benchmarkdataset_test.fasta', 'r') as file:
    lines = file.readlines()

# 初始化兩個空的列表，分別用於存儲標題（Header）和序列（Sequence）
headers = []
sequences = []

# 遍歷 FASTA 文件的每一行
for line in lines:
    line = line.strip()  # 去掉行尾的空白字符

    # 如果行以 '>AA' 開頭，則視為標題
    if line.startswith('>AA') or line.startswith('>neg'):
        headers.append(line)
        sequences.append('')
    else:
        sequences[-1] += line

# 將標題和序列轉換為 DataFrame
data = {'Header': headers, 'Sequence': sequences}
bench_test = pd.DataFrame(data)

#print(bench_test)

print()
# 計算每種胺基酸的出現次數
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
count_matrix_test = np.zeros((len(sequences), len(amino_acids)))

for i, sequence in enumerate(sequences):
    for j, amino_acid in enumerate(amino_acids):
        count_matrix_test[i, j] = sequence.count(amino_acid)

# 計算每種胺基酸在序列中的比例
percentage_matrix_test = count_matrix_test / len(sequences[0])
print('percentage_test in 1-D:')
print(percentage_matrix_test)

print()
# 顯示比例矩陣
df_percentage_test = pd.DataFrame(percentage_matrix_test, columns=list(amino_acids))
#print(df_percentage_test)

# 合併特徵資料和目標資料
df_combined_test = pd.concat([df_percentage_test, bench_test['Header']], axis=1)

# 將目標轉換為二元標籤（1 表示 'AA'，0 表示 'neg'）
df_combined_test['Target'] = df_combined_test['Header'].apply(lambda x: 1 if 'AA' in x else 0)

# 分割資料集
X_test, y_test = df_combined_test.drop(['Header', 'Target'], axis=1), df_combined_test['Target']

#print(X_test)
#print(y_test)
#print(y_test.value_counts())


percentage_test in 1-D:
[[0.15789474 0.10526316 0.         ... 0.         0.         0.        ]
 [0.10526316 0.         0.10526316 ... 0.10526316 0.05263158 0.        ]
 [0.36842105 0.         0.05263158 ... 0.05263158 0.         0.        ]
 ...
 [0.15789474 0.         0.05263158 ... 0.         0.         0.        ]
 [0.21052632 0.         0.05263158 ... 0.10526316 0.         0.05263158]
 [0.31578947 0.05263158 0.15789474 ... 0.15789474 0.         0.05263158]]



In [79]:
svm_model = SVC()
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

Model Accuracy: 50.91%


In [54]:
with open('NT15dataset_train.fasta', 'r') as file:
    lines = file.readlines()

# 初始化兩個空的列表，分別用於存儲標題（Header）和序列（Sequence）
headers = []
sequences = []

# 遍歷 FASTA 文件的每一行
for line in lines:
    line = line.strip()  # 去掉行尾的空白字符

    # 如果行以 '>AA' 開頭，則視為標題
    if line.startswith('>AA') or line.startswith('>neg'):
        headers.append(line)
        sequences.append('')
    else:
        sequences[-1] += line

# 將標題和序列轉換為 DataFrame
data = {'Header': headers, 'Sequence': sequences}
nt15_train = pd.DataFrame(data)

print(nt15_train)

print()
# 計算每種胺基酸的出現次數
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
count_matrix = np.zeros((len(sequences), len(amino_acids)))

for i, sequence in enumerate(sequences):
    for j, amino_acid in enumerate(amino_acids):
        count_matrix[i, j] = sequence.count(amino_acid)

# 計算每種胺基酸在序列中的比例
percentage_matrix = count_matrix / len(sequences[0])
print('percentage in 1-D:')
print(percentage_matrix)

print()
# 顯示比例矩陣
df_percentage = pd.DataFrame(percentage_matrix, columns=list(amino_acids))
print(df_percentage)

      Header         Sequence
0      >AA26  EKYEGKISKTMSGLD
1      >AA27  ESLARPCAPGAPAEA
2      >AA28  FCNINNVCNFASRND
3      >AA29  FLKDHRISTFKNWPF
4      >AA30  FLSSRLQDLYSIVRR
..       ...              ...
155  >neg131  VVRLAREPGKRESRY
156  >neg132  YEDLRDESLKGLVDI
157  >neg133  YFLIQSVSSTVMLLN
158  >neg134  YGEPGMQLFVYGREE
159  >neg135  YNLSDTIKAFSILLL

[160 rows x 2 columns]

percentage in 1-D:
[[0.         0.         0.06666667 ... 0.         0.         0.06666667]
 [0.33333333 0.06666667 0.         ... 0.         0.         0.        ]
 [0.06666667 0.13333333 0.06666667 ... 0.06666667 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.13333333 0.         0.06666667]
 [0.         0.         0.         ... 0.06666667 0.         0.13333333]
 [0.06666667 0.         0.06666667 ... 0.         0.         0.06666667]]

            A         C         D         E         F         G         H  \
0    0.000000  0.000000  0.066667  0.133333  0.000000  0.133333  0.000000 

In [42]:
with open('NT15dataset_test.fasta', 'r') as file:
    lines = file.readlines()

# 初始化兩個空的列表，分別用於存儲標題（Header）和序列（Sequence）
headers = []
sequences = []

# 遍歷 FASTA 文件的每一行
for line in lines:
    line = line.strip()  # 去掉行尾的空白字符

    # 如果行以 '>AA' 開頭，則視為標題
    if line.startswith('>AA') or line.startswith('>neg'):
        headers.append(line)
        sequences.append('')
    else:
        sequences[-1] += line

# 將標題和序列轉換為 DataFrame
data = {'Header': headers, 'Sequence': sequences}
nt15_test = pd.DataFrame(data)

#print(nt15_test)