# Antimicrobial Peptide (AMP) recognition using a Bag-of-Words (BoW) representation

## All necessary imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence

RND_SEED = 42  # for reproducibility
# Seed every random number generator this notebook relies on:
# NumPy's legacy global RNG, and TensorFlow's global seed, which Keras
# draws on for weight initialisation and dropout masks.
np.random.seed(RND_SEED)
tf.random.set_seed(RND_SEED)

## Reading the AMP data

In [2]:
# Read the AMP dataset and shuffle its rows in a single chained expression;
# the fixed random_state makes the row order repeatable.
df = (
    pd.read_csv('../data_instadeep/amp/all_data.csv')
    .sample(frac=1, random_state=RND_SEED)
)
print(df.shape)

(4042, 3)


In [3]:
# Preview the first five shuffled rows
df.head(n=5)

Unnamed: 0,PDBs_code,SequenceID,label
4003,UniRef50_Q8BVR0,EKDVAWVDGISRELAINLVTKGFNKAYVLLGQFLLMHKNEAEFQRW...,0
149,AP00685,GIMDTVKGVAKTVAASLLDKLKCKITGC,1
2025,UniRef50_Q10474,QISVTKAGRPYCQSAHCPPIIFD,0
2505,UniRef50_Q8TAP6,TVWEDQLSYLLSPALASYEFERTTS,0
3203,UniRef50_Q5BFN9,NEDFELEGANPSAE,0


In [4]:
# Print a small fixed-width table with the number of distinct values per column
header_line = f'{"Column":15s} # of unique values'
rule_line = f'{"-"*40:60s}'
print(header_line)
print(rule_line)
for col in df.columns:
    distinct_values = df[col].unique()
    print(f'{col:15s} {len(distinct_values)}')

Column          # of unique values
----------------------------------------                    
PDBs_code       4042
SequenceID      4042
label           2


In [5]:
# Class balance check: relative frequency of each label value
label_column = df['label']
label_column.value_counts(normalize=True)

0    0.5
1    0.5
Name: label, dtype: float64

## Bag Of Words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Collect the alphabet: every distinct character that appears in any sequence.
set_chars = set().union(*df["SequenceID"])
# Alphabetical order fixes the column order of the bag-of-words features.
list_chars = sorted(set_chars)
' '.join(list_chars)

'A C D E F G H I K L M N P Q R S T V W X Y'

In [8]:
from collections import Counter

In [9]:
# Preview the first ten shuffled rows
df.head(n=10)

Unnamed: 0,PDBs_code,SequenceID,label
4003,UniRef50_Q8BVR0,EKDVAWVDGISRELAINLVTKGFNKAYVLLGQFLLMHKNEAEFQRW...,0
149,AP00685,GIMDTVKGVAKTVAASLLDKLKCKITGC,1
2025,UniRef50_Q10474,QISVTKAGRPYCQSAHCPPIIFD,0
2505,UniRef50_Q8TAP6,TVWEDQLSYLLSPALASYEFERTTS,0
3203,UniRef50_Q5BFN9,NEDFELEGANPSAE,0
1811,AP01936,FFPIVGKLLSGLL,1
2024,UniRef50_A7HBQ0,GHLGVAGSEGALATVRRIALRREGEPEPPLQEVLEARPQGRLWAVR...,0
2020,AP02211,FLNALKNFAKTAGKRLKSLLN,1
952,AP01795,QIINNPITCMTNGAICWGPCPTAFRQIGNCGHFKVRCCKIR,1
1041,AP02875,LKRVWKRVFKLLKRYWRQLKKPVR,1


In [10]:
# Bag-of-words encoding: for each sequence, count the occurrences of every
# symbol of the alphabet, in `list_chars` order.
# Each sequence is tallied exactly once (instead of rebuilding the Counter for
# every alphabet symbol); a Counter returns 0 for symbols that do not occur.
counts = [
    [tally[symbol] for symbol in list_chars]
    for tally in map(Counter, df["SequenceID"])
]
counts

[[5, 3, 2, 4, 3, 4, 1, 4, 4, 6, 1, 3, 0, 2, 2, 1, 1, 4, 2, 0, 1],
 [3, 2, 2, 0, 0, 3, 0, 2, 5, 3, 1, 0, 0, 0, 0, 1, 3, 3, 0, 0, 0],
 [2, 2, 1, 0, 1, 1, 1, 3, 1, 0, 0, 0, 3, 2, 1, 2, 1, 1, 0, 0, 1],
 [2, 0, 1, 3, 1, 0, 0, 0, 0, 4, 0, 0, 1, 1, 1, 4, 3, 1, 1, 0, 2],
 [2, 0, 1, 4, 1, 1, 0, 0, 0, 1, 0, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 2, 2, 0, 1, 1, 4, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0],
 [10, 0, 1, 9, 0, 8, 1, 2, 0, 7, 0, 0, 4, 2, 8, 3, 2, 7, 2, 0, 0],
 [3, 0, 0, 0, 2, 1, 0, 0, 4, 5, 0, 3, 0, 0, 1, 1, 1, 0, 0, 0, 0],
 [2, 6, 0, 0, 2, 4, 1, 6, 2, 0, 1, 4, 3, 2, 3, 0, 3, 1, 1, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 6, 4, 0, 0, 1, 1, 5, 0, 0, 3, 2, 0, 1],
 [8, 4, 3, 1, 2, 22, 2, 5, 2, 2, 2, 12, 4, 1, 7, 15, 4, 5, 0, 0, 1],
 [2, 8, 3, 0, 2, 3, 3, 2, 2, 3, 2, 2, 3, 5, 1, 1, 0, 3, 2, 0, 0],
 [1, 6, 0, 0, 4, 6, 2, 1, 3, 2, 0, 1, 2, 0, 1, 4, 0, 1, 2, 0, 0],
 [0, 1, 0, 1, 0, 0, 0, 2, 0, 2, 0, 0, 1, 0, 2, 0, 2, 0, 0, 0, 1],
 [8, 2, 1, 1, 1, 8, 1, 2, 4, 5, 1, 8, 4, 0, 2, 2, 0, 1, 1, 0, 1],
 [3, 2

In [11]:
# Show the identifier column that will label the rows of the count table
df.loc[:, "PDBs_code"]

4003    UniRef50_Q8BVR0
149             AP00685
2025    UniRef50_Q10474
2505    UniRef50_Q8TAP6
3203    UniRef50_Q5BFN9
             ...       
1130            AP02814
1294            AP00264
860             AP01242
3507    UniRef50_P38483
3174    UniRef50_P34893
Name: PDBs_code, Length: 4042, dtype: object

In [12]:
# Turn the nested count lists into a table with one column per alphabet symbol,
# then attach the peptide identifiers as an extra "id" column.
peptide_ids = list(df["PDBs_code"])
df_counts = pd.DataFrame(counts, columns=list_chars).assign(id=peptide_ids)
df_counts

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,P,Q,R,S,T,V,W,X,Y,id
0,5,3,2,4,3,4,1,4,4,6,...,0,2,2,1,1,4,2,0,1,UniRef50_Q8BVR0
1,3,2,2,0,0,3,0,2,5,3,...,0,0,0,1,3,3,0,0,0,AP00685
2,2,2,1,0,1,1,1,3,1,0,...,3,2,1,2,1,1,0,0,1,UniRef50_Q10474
3,2,0,1,3,1,0,0,0,0,4,...,1,1,1,4,3,1,1,0,2,UniRef50_Q8TAP6
4,2,0,1,4,1,1,0,0,0,1,...,1,0,0,1,0,0,0,0,0,UniRef50_Q5BFN9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4037,7,0,1,0,1,6,2,1,6,2,...,1,1,4,3,0,2,0,0,0,AP02814
4038,1,6,1,0,4,3,0,2,5,4,...,1,0,5,5,1,0,0,0,1,AP00264
4039,0,0,0,0,1,3,2,3,1,4,...,3,0,0,2,0,2,0,0,0,AP01242
4040,2,1,2,2,2,3,3,5,0,4,...,2,3,6,3,3,3,1,0,5,UniRef50_P38483


## Sequence Classification using Deep Learning

In [13]:
# Feature matrix: the symbol counts, indexed by peptide identifier
X = df_counts.set_index(keys="id")
X

Unnamed: 0_level_0,A,C,D,E,F,G,H,I,K,L,...,N,P,Q,R,S,T,V,W,X,Y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
UniRef50_Q8BVR0,5,3,2,4,3,4,1,4,4,6,...,3,0,2,2,1,1,4,2,0,1
AP00685,3,2,2,0,0,3,0,2,5,3,...,0,0,0,0,1,3,3,0,0,0
UniRef50_Q10474,2,2,1,0,1,1,1,3,1,0,...,0,3,2,1,2,1,1,0,0,1
UniRef50_Q8TAP6,2,0,1,3,1,0,0,0,0,4,...,0,1,1,1,4,3,1,1,0,2
UniRef50_Q5BFN9,2,0,1,4,1,1,0,0,0,1,...,2,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AP02814,7,0,1,0,1,6,2,1,6,2,...,0,1,1,4,3,0,2,0,0,0
AP00264,1,6,1,0,4,3,0,2,5,4,...,0,1,0,5,5,1,0,0,0,1
AP01242,0,0,0,0,1,3,2,3,1,4,...,0,3,0,0,2,0,2,0,0,0
UniRef50_P38483,2,1,2,2,2,3,3,5,0,4,...,3,2,3,6,3,3,3,1,0,5


In [14]:
# Target vector: the label column as a standalone NumPy array (a copy, not a view)
y = df['label'].to_numpy(copy=True)

We will perform a 10-fold cross-validation to measure the performance of the classification model.

In [15]:
# 10-fold cross-validation of a small feed-forward classifier on the BoW counts.
kfold = 10
random_state = 1
epochs = 50
batch_size = 128

# One slot per fold for every reported metric.
test_accuracy = np.zeros(kfold)
test_recall = np.zeros(kfold)
test_f1 = np.zeros(kfold)

# NOTE: despite its name, `skf` is a plain shuffled KFold, not a StratifiedKFold.
skf = KFold(n_splits=kfold, shuffle=True, random_state=random_state)


def build_model(n_features):
    """Build and compile a fresh binary classifier.

    Architecture: Dense(64, relu) -> Dropout(0.5) -> Dense(32, relu)
    -> Dropout(0.5) -> Dense(1) -> sigmoid. Compiled with binary
    cross-entropy loss, the Adam optimizer and accuracy as a metric.

    Parameters
    ----------
    n_features : int
        Number of input columns fed to the first layer.

    Returns
    -------
    tensorflow.keras.models.Sequential
        A compiled, untrained model.
    """
    model = Sequential()
    model.add(Dense(64, input_shape=(n_features,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


for k, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Release Keras state left over from the previous fold so that memory
    # does not keep growing while ten models are built in a row.
    tf.keras.backend.clear_session()

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # A new model per fold: no weights are carried over between folds.
    model = build_model(X_train.shape[1])
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=0)

    # The single sigmoid unit yields one probability per sample in a column;
    # flatten it to 1-D and threshold at 0.5 to obtain hard 0/1 labels.
    probabilities = model.predict(X_test, verbose=0).ravel()
    y_pred = (probabilities > 0.5).astype(int)

    test_accuracy[k] = sklearn.metrics.accuracy_score(y_test, y_pred)
    test_recall[k] = sklearn.metrics.recall_score(y_test, y_pred)
    test_f1[k] = sklearn.metrics.f1_score(y_test, y_pred)

# Report each metric averaged over the folds.
print('Average accuracy score', np.mean(test_accuracy))
print('Average recall score', np.mean(test_recall))
print('Average f1 score', np.mean(test_f1))

2022-10-05 23:26:16.499725: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Average accuracy score 0.9064869820315365
Average recall score 0.8880832457433024
Average f1 score 0.9044190294813701


Given the simplicity of the BoW embedding, an accuracy of 90.6% seems pretty good!