# DNA-binding Protein recognition using the BOW

## All necessary imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence

RND_SEED = 42  # for reproducibility
np.random.seed(RND_SEED)

## Reading the DBP data

In [4]:
# Loading data
df = pd.read_csv('../../data/dna_binding/train.csv')
df = df.sample(frac=1, random_state=RND_SEED)
print(df.shape)

(14189, 4)


In [5]:
df.head(5)

Unnamed: 0,code,sequence,label,origin
5646,Q9YEZ5,MADARFYFSDARTWRYMVASIEKIIEEGVFVATGEGLSLRALDTSH...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
2012,Q24762,MSAAGDAGAGAANGSNNVAVVQATVSVSGNISVGDGSPNNNNNNNA...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
14156,B2V652,MEVITNPGQMQTLMLSLKKQGKKIGFVPTMGYLHEGHLSLIRCSKK...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
8852,Q5HWG0,MLEGIVRESIGRKAAKALKRDGYLIANIYGKGLENINAAFKVNEFI...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
12836,Q492D3,MIHGIGIDIVDIRKIKKIITHSGDKLATRILSKSEWKIYKNKKHPV...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...


In [6]:
print(f'{"Column":15s} # of unique values')
print(f'{"-"*40:60s}')
for col in df:
    print(f'{col:15s} {len(df[col].unique())}')

Column          # of unique values
----------------------------------------                    
code            14189
sequence        14016
label           2
origin          1


In [7]:
# Is the data balanced?
df['label'].value_counts(normalize=True)

1    0.502431
0    0.497569
Name: label, dtype: float64

## Bag Of Words

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
set_chars = set()
for i in list(df["sequence"]):
    set_chars = set_chars | set(i)
list_chars = list(set_chars)
list_chars.sort()
' '.join(list_chars)

'A B C D E F G H I K L M N O P Q R S T U V W X Y'

In [11]:
from collections import Counter

In [13]:
counts = [[Counter(i)[list_chars[j]] for j, c in enumerate(list_chars)] for i in list(df["sequence"])]
counts

[[23,
  0,
  0,
  14,
  28,
  12,
  14,
  2,
  9,
  8,
  26,
  8,
  1,
  0,
  7,
  4,
  20,
  25,
  12,
  0,
  24,
  1,
  0,
  11],
 [77,
  0,
  2,
  20,
  33,
  9,
  29,
  19,
  24,
  29,
  34,
  10,
  33,
  0,
  24,
  132,
  19,
  40,
  32,
  0,
  40,
  2,
  0,
  10],
 [11,
  0,
  5,
  18,
  21,
  16,
  17,
  4,
  19,
  25,
  21,
  8,
  15,
  0,
  13,
  11,
  13,
  15,
  9,
  0,
  28,
  0,
  0,
  11],
 [16,
  0,
  1,
  11,
  9,
  5,
  13,
  1,
  13,
  22,
  16,
  3,
  7,
  0,
  4,
  4,
  9,
  6,
  7,
  0,
  27,
  0,
  0,
  4],
 [10, 0, 2, 5, 4, 7, 7, 6, 15, 17, 12, 3, 6, 0, 2, 2, 4, 8, 7, 0, 5, 1, 0, 2],
 [23,
  0,
  0,
  11,
  11,
  6,
  12,
  4,
  10,
  16,
  20,
  4,
  9,
  0,
  10,
  9,
  15,
  13,
  11,
  0,
  14,
  2,
  0,
  5],
 [45,
  0,
  2,
  37,
  36,
  15,
  41,
  12,
  27,
  25,
  49,
  12,
  11,
  0,
  28,
  9,
  50,
  15,
  25,
  0,
  39,
  5,
  0,
  4],
 [31,
  0,
  3,
  23,
  25,
  10,
  33,
  8,
  21,
  8,
  33,
  5,
  7,
  0,
  17,
  6,
  26,
  27,
  18,
  0,
  45,

In [14]:
df_counts = pd.DataFrame(data=counts, columns=list_chars)
df_counts["id"] = list(range(df.shape[0]))
df_counts

Unnamed: 0,A,B,C,D,E,F,G,H,I,K,...,Q,R,S,T,U,V,W,X,Y,id
0,23,0,0,14,28,12,14,2,9,8,...,4,20,25,12,0,24,1,0,11,0
1,77,0,2,20,33,9,29,19,24,29,...,132,19,40,32,0,40,2,0,10,1
2,11,0,5,18,21,16,17,4,19,25,...,11,13,15,9,0,28,0,0,11,2
3,16,0,1,11,9,5,13,1,13,22,...,4,9,6,7,0,27,0,0,4,3
4,10,0,2,5,4,7,7,6,15,17,...,2,4,8,7,0,5,1,0,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14184,36,0,3,19,22,9,22,6,33,24,...,10,18,21,23,0,20,0,0,6,14184
14185,13,0,5,13,14,6,15,1,11,9,...,11,16,10,9,0,22,4,0,10,14185
14186,31,0,6,21,20,11,16,12,24,21,...,6,16,18,17,0,18,2,0,13,14186
14187,18,0,0,11,21,12,23,2,21,19,...,8,11,23,19,0,24,0,0,9,14187


## Sequence Classification using Deep Learning

In [15]:
X = df_counts.set_index("id")
X

Unnamed: 0_level_0,A,B,C,D,E,F,G,H,I,K,...,P,Q,R,S,T,U,V,W,X,Y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,23,0,0,14,28,12,14,2,9,8,...,7,4,20,25,12,0,24,1,0,11
1,77,0,2,20,33,9,29,19,24,29,...,24,132,19,40,32,0,40,2,0,10
2,11,0,5,18,21,16,17,4,19,25,...,13,11,13,15,9,0,28,0,0,11
3,16,0,1,11,9,5,13,1,13,22,...,4,4,9,6,7,0,27,0,0,4
4,10,0,2,5,4,7,7,6,15,17,...,2,2,4,8,7,0,5,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14184,36,0,3,19,22,9,22,6,33,24,...,16,10,18,21,23,0,20,0,0,6
14185,13,0,5,13,14,6,15,1,11,9,...,14,11,16,10,9,0,22,4,0,10
14186,31,0,6,21,20,11,16,12,24,21,...,12,6,16,18,17,0,18,2,0,13
14187,18,0,0,11,21,12,23,2,21,19,...,9,8,11,23,19,0,24,0,0,9


In [16]:
y = df['label']
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)
y.head(10)

5646     1
2012     1
14156    0
8852     0
12836    0
9494     0
7918     0
11107    0
11600    0
7551     0
Name: label, dtype: int64

We will perform a 10-fold cross-validation to measure the performance of the classification model.

In [17]:
kfold = 10
X = X
y = encoded_y

random_state = 1

xval_accuracy = np.zeros(kfold)
xval_recall = np.zeros(kfold)
xval_f1 = np.zeros(kfold)
skf = KFold(n_splits = kfold, shuffle = True, random_state = random_state)
k = 0
epochs = 50
batch_size = 128

models = []
for train_index, xval_index in skf.split(X, y):
    X_train, X_xval = X.iloc[train_index], X.iloc[xval_index]
    y_train, y_xval = y[train_index], y[xval_index]
    
    model = Sequential()
    model.add(Dense(64, input_shape = (X_train.shape[1],))) 
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=0)
    models.append(model)
    
    y_pred = model.predict(X_xval).round().astype(int)

    xval_accuracy[k] = sklearn.metrics.accuracy_score(y_xval, y_pred)
    xval_recall[k] = sklearn.metrics.recall_score(y_xval, y_pred)
    xval_f1[k] = sklearn.metrics.f1_score(y_xval, y_pred)
    k+=1
    
print ('Average accuracy score', np.mean(xval_accuracy))
print ('Average recall score', np.mean(xval_recall))
print ('Average f1 score', np.mean(xval_f1))

2022-09-29 20:27:28.136275: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Average accuracy score 0.788991830596449
Average recall score 0.8294592861785686
Average f1 score 0.7977736173552225


In [18]:
xval_accuracy

array([0.7914024 , 0.78153629, 0.78717407, 0.76744186, 0.79281184,
       0.79633545, 0.79422128, 0.79422128, 0.79069767, 0.79407616])

In [19]:
xval_accuracy.argmax()

5

## Test accuracy

In [24]:
# Loading the test data
df_test = pd.read_csv('../../data/dna_binding/test.csv')
print(df_test.shape)
df_test

(2272, 4)


Unnamed: 0,code,sequence,label,origin
0,P27204|1,AKKRSRSRKRSASRKRSRSRKRSASKKSSKKHVRKALAAGMKNHLL...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
1,P53528|1,MVMVVNPLTAGLDDEQREAVLAPRGPVCVLAGAGTGKTRTITHRIA...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
2,P52684|1,MKDDINQEITFRKLSVFMMFMAKGNIARTAEAMKLSSVSVHRALHT...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
3,P10961|1,MNNAANTGTTNESNVSDAPRIEPLPSLNDDDIEKILQPNDIFTTDR...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
4,P06023|1,MAKPAKRIKSAAAAYVPQNRDAVITDIKRIGDLQREASRLETEMND...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
...,...,...,...,...
2267,P37471|2,MNFSRERTITEIQNDYKEQVERQNQLKKRRRKGLYRRLTVFGALVF...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
2268,P07078|2,MVVVDKEIKKGQYYLVNGNVVRVTYVNGFDVYYLILKLHKRMICDR...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
2269,P16793|2,MNPSTHVSSNGPTTPPHGPHTTFLPPTSPAPSTSSVAAATLCSPQR...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
2270,P80484|2,MVRSGKKAVVLAAVAFCATSVVQKSHGFVPSPLRQRAAAAGAAAAS...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...


In [25]:
# The test data sequence has a lowercase 'v', but the model expects all uppercase.
# So just transforming everything to uppercase
df_test["sequence"] = df_test["sequence"].transform(lambda x: x.upper())
df_test

Unnamed: 0,code,sequence,label,origin
0,P27204|1,AKKRSRSRKRSASRKRSRSRKRSASKKSSKKHVRKALAAGMKNHLL...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
1,P53528|1,MVMVVNPLTAGLDDEQREAVLAPRGPVCVLAGAGTGKTRTITHRIA...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
2,P52684|1,MKDDINQEITFRKLSVFMMFMAKGNIARTAEAMKLSSVSVHRALHT...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
3,P10961|1,MNNAANTGTTNESNVSDAPRIEPLPSLNDDDIEKILQPNDIFTTDR...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
4,P06023|1,MAKPAKRIKSAAAAYVPQNRDAVITDIKRIGDLQREASRLETEMND...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
...,...,...,...,...
2267,P37471|2,MNFSRERTITEIQNDYKEQVERQNQLKKRRRKGLYRRLTVFGALVF...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
2268,P07078|2,MVVVDKEIKKGQYYLVNGNVVRVTYVNGFDVYYLILKLHKRMICDR...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
2269,P16793|2,MNPSTHVSSNGPTTPPHGPHTTFLPPTSPAPSTSSVAAATLCSPQR...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
2270,P80484|2,MVRSGKKAVVLAAVAFCATSVVQKSHGFVPSPLRQRAAAAGAAAAS...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...


In [26]:
counts_test = [[Counter(i)[list_chars[j]] for j, c in enumerate(list_chars)] for i in list(df_test["sequence"])]
counts_test

[[14, 0, 0, 0, 0, 1, 2, 3, 1, 21, 4, 1, 3, 0, 4, 0, 27, 14, 1, 0, 5, 0, 0, 0],
 [108,
  0,
  9,
  41,
  45,
  19,
  45,
  13,
  24,
  13,
  78,
  6,
  14,
  0,
  34,
  33,
  63,
  39,
  39,
  0,
  67,
  8,
  0,
  16],
 [32,
  0,
  2,
  19,
  22,
  15,
  19,
  6,
  17,
  12,
  41,
  15,
  10,
  0,
  9,
  9,
  24,
  14,
  14,
  0,
  19,
  1,
  0,
  8],
 [38,
  0,
  0,
  65,
  46,
  31,
  33,
  22,
  44,
  42,
  58,
  14,
  120,
  0,
  50,
  31,
  39,
  93,
  60,
  0,
  26,
  6,
  0,
  15],
 [21,
  0,
  1,
  9,
  16,
  4,
  12,
  0,
  18,
  13,
  10,
  4,
  6,
  0,
  7,
  6,
  13,
  8,
  11,
  0,
  12,
  2,
  0,
  1],
 [41,
  0,
  3,
  34,
  73,
  28,
  38,
  14,
  109,
  132,
  76,
  13,
  58,
  0,
  29,
  20,
  20,
  39,
  55,
  0,
  25,
  3,
  0,
  38],
 [37,
  0,
  1,
  18,
  18,
  2,
  17,
  10,
  12,
  7,
  41,
  6,
  6,
  0,
  15,
  13,
  45,
  19,
  8,
  0,
  23,
  4,
  0,
  3],
 [46,
  0,
  10,
  78,
  96,
  40,
  38,
  24,
  75,
  85,
  126,
  18,
  76,
  0,
  40,
  45,
  53,
  

In [27]:
len(counts_test)

2272

In [28]:
df_counts_test = pd.DataFrame(data=counts_test, columns=list_chars)
df_counts_test["id"] = list(range(df_test.shape[0]))
df_counts_test

Unnamed: 0,A,B,C,D,E,F,G,H,I,K,...,Q,R,S,T,U,V,W,X,Y,id
0,14,0,0,0,0,1,2,3,1,21,...,0,27,14,1,0,5,0,0,0,0
1,108,0,9,41,45,19,45,13,24,13,...,33,63,39,39,0,67,8,0,16,1
2,32,0,2,19,22,15,19,6,17,12,...,9,24,14,14,0,19,1,0,8,2
3,38,0,0,65,46,31,33,22,44,42,...,31,39,93,60,0,26,6,0,15,3
4,21,0,1,9,16,4,12,0,18,13,...,6,13,8,11,0,12,2,0,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2267,5,0,0,6,15,5,4,0,6,17,...,7,10,11,8,0,7,1,0,3,2267
2268,2,0,1,3,2,2,4,2,4,7,...,1,3,2,1,0,11,0,0,5,2268
2269,55,0,27,38,38,27,39,29,20,20,...,27,36,46,50,0,46,6,0,22,2269
2270,78,0,1,24,10,12,24,5,15,30,...,12,7,28,12,0,36,3,0,9,2270


In [29]:
X_test = df_counts_test.set_index("id")
X_test

Unnamed: 0_level_0,A,B,C,D,E,F,G,H,I,K,...,P,Q,R,S,T,U,V,W,X,Y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,14,0,0,0,0,1,2,3,1,21,...,4,0,27,14,1,0,5,0,0,0
1,108,0,9,41,45,19,45,13,24,13,...,34,33,63,39,39,0,67,8,0,16
2,32,0,2,19,22,15,19,6,17,12,...,9,9,24,14,14,0,19,1,0,8
3,38,0,0,65,46,31,33,22,44,42,...,50,31,39,93,60,0,26,6,0,15
4,21,0,1,9,16,4,12,0,18,13,...,7,6,13,8,11,0,12,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2267,5,0,0,6,15,5,4,0,6,17,...,0,7,10,11,8,0,7,1,0,3
2268,2,0,1,3,2,2,4,2,4,7,...,0,1,3,2,1,0,11,0,0,5
2269,55,0,27,38,38,27,39,29,20,20,...,37,27,36,46,50,0,46,6,0,22
2270,78,0,1,24,10,12,24,5,15,30,...,18,12,7,28,12,0,36,3,0,9


In [30]:
y_test = df_test['label']
y_test

0       1
1       1
2       1
3       1
4       1
       ..
2267    0
2268    0
2269    0
2270    0
2271    0
Name: label, Length: 2272, dtype: int64

In [31]:
# choosing the best model based on the xval
opt_model = models[xval_accuracy.argmax()]
y_pred_test = opt_model.predict(X_test).round().astype(int)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred_test)
test_accuracy



0.6694542253521126

Given the simplicity of BoW embedding, accuracy of 66.9% is not bad!