# Anticancer Peptide (ACP) recognition using the BOW

## All necessary imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence

RND_SEED = 42  # for reproducibility
np.random.seed(RND_SEED)

## Reading the ACP data

In [2]:
# Loading data
df = pd.read_csv('../data_instadeep/acp/train_data.csv')
df = df.sample(frac=1, random_state=RND_SEED)
print(df.shape)

(1378, 2)


In [3]:
df.head(5)

Unnamed: 0,sequences,label
597,GLFDIVKKIAGHIVSSI,1
700,KLKNFAKGVAQSLLNKASCKLSGQC,1
1222,GLWNSIKIAGKKLFVNVLDKIRCKVAGGCKTSPDVE,0
1145,GAFGNFLKGVAKKAGLKILSIAQCKLFGTC,1
602,INLKAIAALAKKLLG,0


In [4]:
print(f'{"Column":15s} # of unique values')
print(f'{"-"*40:60s}')
for col in df:
    print(f'{col:15s} {len(df[col].unique())}')

Column          # of unique values
----------------------------------------                    
sequences       1378
label           2


In [5]:
# Is the data balanced?
df['label'].value_counts(normalize=True)

1    0.5
0    0.5
Name: label, dtype: float64

## Bag Of Words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
set_chars = set()
for i in list(df["sequences"]):
    set_chars = set_chars | set(i)
list_chars = list(set_chars)
list_chars.sort()
' '.join(list_chars)

'A C D E F G H I K L M N P Q R S T V W Y'

In [8]:
from collections import Counter

In [9]:
df.head(10)

Unnamed: 0,sequences,label
597,GLFDIVKKIAGHIVSSI,1
700,KLKNFAKGVAQSLLNKASCKLSGQC,1
1222,GLWNSIKIAGKKLFVNVLDKIRCKVAGGCKTSPDVE,0
1145,GAFGNFLKGVAKKAGLKILSIAQCKLFGTC,1
602,INLKAIAALAKKLLG,0
1078,RRIIIRWRRI,0
65,KWKLFKKIPKFLHSAKKF,1
745,VAGPFRIPPLRREFQ,0
478,GLGSILGKILNVAGKVGKTIGKVADAVGNKE,0
67,ESDTVTCRKMKGKCSFLLCPFFKRSSGTCYNGLAKCCRPFW,0


In [10]:
counts = [[Counter(i)[list_chars[j]] for j, c in enumerate(list_chars)] for i in list(df["sequences"])]
counts

[[1, 0, 1, 0, 1, 2, 1, 4, 2, 1, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0],
 [3, 2, 0, 0, 1, 2, 0, 0, 5, 4, 0, 2, 0, 2, 0, 3, 0, 1, 0, 0],
 [2, 2, 2, 1, 1, 4, 0, 3, 6, 3, 0, 2, 1, 0, 1, 2, 1, 4, 1, 0],
 [4, 2, 0, 0, 3, 5, 0, 2, 5, 4, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0],
 [4, 0, 0, 0, 0, 1, 0, 2, 3, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 1, 0],
 [1, 0, 0, 0, 3, 0, 1, 1, 7, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0],
 [1, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 0, 3, 1, 3, 0, 0, 1, 0, 0],
 [3, 0, 1, 1, 0, 7, 0, 3, 5, 3, 0, 2, 0, 0, 0, 1, 1, 4, 0, 0],
 [1, 6, 1, 1, 4, 3, 0, 0, 5, 3, 1, 1, 2, 0, 3, 4, 3, 1, 1, 1],
 [4, 0, 0, 0, 1, 1, 0, 0, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [4, 0, 0, 0, 2, 0, 2, 4, 0, 2, 0, 2, 1, 0, 0, 2, 0, 0, 0, 0],
 [0, 6, 0, 1, 0, 5, 0, 1, 0, 1, 0, 2, 3, 0, 1, 2, 4, 2, 1, 0],
 [0, 6, 1, 1, 0, 4, 1, 0, 2, 5, 1, 1, 0, 0, 7, 1, 2, 1, 0, 2],
 [1, 0, 0, 1, 2, 6, 1, 2, 0, 0, 0, 0, 2, 0, 0, 1, 1, 2, 0, 2],
 [3, 2, 0, 0, 3, 1, 0, 4, 4, 2, 1, 0, 2, 0, 0, 1, 1, 0,

In [11]:
df_counts = pd.DataFrame(data=counts, columns=list_chars)
df_counts["id"] = list(range(df.shape[0]))
df_counts

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,N,P,Q,R,S,T,V,W,Y,id
0,1,0,1,0,1,2,1,4,2,1,...,0,0,0,0,2,0,2,0,0,0
1,3,2,0,0,1,2,0,0,5,4,...,2,0,2,0,3,0,1,0,0,1
2,2,2,2,1,1,4,0,3,6,3,...,2,1,0,1,2,1,4,1,0,2
3,4,2,0,0,3,5,0,2,5,4,...,1,0,1,0,1,1,1,0,0,3
4,4,0,0,0,0,1,0,2,3,4,...,1,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1373,1,0,1,0,0,6,0,0,0,6,...,2,0,0,0,2,2,3,0,0,1373
1374,6,6,1,0,1,5,1,1,1,2,...,2,0,0,4,4,1,3,0,2,1374
1375,0,2,0,0,0,6,0,2,2,5,...,0,0,0,0,2,0,2,0,0,1375
1376,5,6,0,0,0,4,0,0,2,3,...,6,0,1,0,2,3,3,3,1,1376


## Sequence Classification using Deep Learning

In [12]:
X = df_counts.set_index("id")
X

Unnamed: 0_level_0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,1,0,1,0,1,2,1,4,2,1,0,0,0,0,0,2,0,2,0,0
1,3,2,0,0,1,2,0,0,5,4,0,2,0,2,0,3,0,1,0,0
2,2,2,2,1,1,4,0,3,6,3,0,2,1,0,1,2,1,4,1,0
3,4,2,0,0,3,5,0,2,5,4,0,1,0,1,0,1,1,1,0,0
4,4,0,0,0,0,1,0,2,3,4,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1373,1,0,1,0,0,6,0,0,0,6,0,2,0,0,0,2,2,3,0,0
1374,6,6,1,0,1,5,1,1,1,2,0,2,0,0,4,4,1,3,0,2
1375,0,2,0,0,0,6,0,2,2,5,0,0,0,0,0,2,0,2,0,0
1376,5,6,0,0,0,4,0,0,2,3,0,6,0,1,0,2,3,3,3,1


In [13]:
y = np.array(df['label'])
y

array([1, 1, 0, ..., 0, 0, 1])

We will perform a 10-fold cross-validation to measure the performance of the classification model.

In [14]:
kfold = 10
random_state = 1

xval_accuracy = np.zeros(kfold)
xval_recall = np.zeros(kfold)
xval_f1 = np.zeros(kfold)
skf = KFold(n_splits = kfold, shuffle = True, random_state = random_state)
k = 0
epochs = 50
batch_size = 128

models = []
for train_index, xval_index in skf.split(X, y):
    X_train, X_xval = X.iloc[train_index], X.iloc[xval_index]
    y_train, y_xval = y[train_index], y[xval_index]
    
    model = Sequential()
    model.add(Dense(64, input_shape = (X_train.shape[1],), activation='relu')) 
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=0)
    models.append(model)
    
    y_pred = model.predict(X_xval).round().astype(int)

    xval_accuracy[k] = sklearn.metrics.accuracy_score(y_xval, y_pred)
    xval_recall[k] = sklearn.metrics.recall_score(y_xval, y_pred)
    xval_f1[k] = sklearn.metrics.f1_score(y_xval, y_pred)
    k+=1
    
print ('Average accuracy score', np.mean(xval_accuracy))
print ('Average recall score', np.mean(xval_recall))
print ('Average f1 score', np.mean(xval_f1))

2022-10-05 23:05:50.435253: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Average accuracy score 0.6814291759229874
Average recall score 0.5893910798015127
Average f1 score 0.6451878803031386


BoW embedding achieves average accuracy of 68.1%