# DNA-binding Protein recognition using the BOW

## All necessary imports

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence

RND_SEED = 42  # for reproducibility
np.random.seed(RND_SEED)

## Reading the DBP data

In [13]:
# Loading data
df = pd.read_csv('../data_instadeep/dna_binding/train.csv')
df = df.sample(frac=1, random_state=RND_SEED)
print(df.shape)

(14189, 4)


In [14]:
df.head(5)

Unnamed: 0,code,sequence,label,origin
5646,Q9YEZ5,MADARFYFSDARTWRYMVASIEKIIEEGVFVATGEGLSLRALDTSH...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
2012,Q24762,MSAAGDAGAGAANGSNNVAVVQATVSVSGNISVGDGSPNNNNNNNA...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
14156,B2V652,MEVITNPGQMQTLMLSLKKQGKKIGFVPTMGYLHEGHLSLIRCSKK...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
8852,Q5HWG0,MLEGIVRESIGRKAAKALKRDGYLIANIYGKGLENINAAFKVNEFI...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
12836,Q492D3,MIHGIGIDIVDIRKIKKIITHSGDKLATRILSKSEWKIYKNKKHPV...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...


In [15]:
print(f'{"Column":15s} # of unique values')
print(f'{"-"*40:60s}')
for col in df:
    print(f'{col:15s} {len(df[col].unique())}')

Column          # of unique values
----------------------------------------                    
code            14189
sequence        14016
label           2
origin          1


In [16]:
# Is the data balanced?
df['label'].value_counts(normalize=True)

1    0.502431
0    0.497569
Name: label, dtype: float64

## Bag Of Words

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
set_chars = set()
for i in list(df["sequence"]):
    set_chars = set_chars | set(i)
list_chars = list(set_chars)
list_chars.sort()
' '.join(list_chars)

'A B C D E F G H I K L M N O P Q R S T U V W X Y'

In [19]:
from collections import Counter

In [20]:
df.head(10)

Unnamed: 0,code,sequence,label,origin
5646,Q9YEZ5,MADARFYFSDARTWRYMVASIEKIIEEGVFVATGEGLSLRALDTSH...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
2012,Q24762,MSAAGDAGAGAANGSNNVAVVQATVSVSGNISVGDGSPNNNNNNNA...,1,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
14156,B2V652,MEVITNPGQMQTLMLSLKKQGKKIGFVPTMGYLHEGHLSLIRCSKK...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
8852,Q5HWG0,MLEGIVRESIGRKAAKALKRDGYLIANIYGKGLENINAAFKVNEFI...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
12836,Q492D3,MIHGIGIDIVDIRKIKKIITHSGDKLATRILSKSEWKIYKNKKHPV...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
9494,Q2L2F7,MDLKLLNDQGQAATFSAPDTIFGRDFNEALVHQIVVAYQANARSGN...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
7918,Q3J2Y1,MSFTLAIVGRPNVGKSTLFNRLVGKRLALVDDQPGVTRDLREGDAR...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
11107,P08075,MKALVLAGGTGTRLRPITHTSAKQLVPVANKPVLFYGLEAIRAAGI...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
11600,A4QJT5,MSRYRGPRFKKIRRLGALPGLTSKRPKAGSDLRNQSRSVKKSQYRI...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...
7551,Q87RN0,MEQKIVNIGDIQVANDKPFTLFAGMNVLESRDLAMQICEHYVKVTD...,0,https://github.com/hfuulgb/PDB-Fusion/tree/mai...


In [21]:
counts = [[Counter(i)[list_chars[j]] for j, c in enumerate(list_chars)] for i in list(df["sequence"])]
counts

[[23,
  0,
  0,
  14,
  28,
  12,
  14,
  2,
  9,
  8,
  26,
  8,
  1,
  0,
  7,
  4,
  20,
  25,
  12,
  0,
  24,
  1,
  0,
  11],
 [77,
  0,
  2,
  20,
  33,
  9,
  29,
  19,
  24,
  29,
  34,
  10,
  33,
  0,
  24,
  132,
  19,
  40,
  32,
  0,
  40,
  2,
  0,
  10],
 [11,
  0,
  5,
  18,
  21,
  16,
  17,
  4,
  19,
  25,
  21,
  8,
  15,
  0,
  13,
  11,
  13,
  15,
  9,
  0,
  28,
  0,
  0,
  11],
 [16,
  0,
  1,
  11,
  9,
  5,
  13,
  1,
  13,
  22,
  16,
  3,
  7,
  0,
  4,
  4,
  9,
  6,
  7,
  0,
  27,
  0,
  0,
  4],
 [10, 0, 2, 5, 4, 7, 7, 6, 15, 17, 12, 3, 6, 0, 2, 2, 4, 8, 7, 0, 5, 1, 0, 2],
 [23,
  0,
  0,
  11,
  11,
  6,
  12,
  4,
  10,
  16,
  20,
  4,
  9,
  0,
  10,
  9,
  15,
  13,
  11,
  0,
  14,
  2,
  0,
  5],
 [45,
  0,
  2,
  37,
  36,
  15,
  41,
  12,
  27,
  25,
  49,
  12,
  11,
  0,
  28,
  9,
  50,
  15,
  25,
  0,
  39,
  5,
  0,
  4],
 [31,
  0,
  3,
  23,
  25,
  10,
  33,
  8,
  21,
  8,
  33,
  5,
  7,
  0,
  17,
  6,
  26,
  27,
  18,
  0,
  45,

In [22]:
df_counts = pd.DataFrame(data=counts, columns=list_chars)
df_counts["id"] = list(df["code"])
df_counts

Unnamed: 0,A,B,C,D,E,F,G,H,I,K,...,Q,R,S,T,U,V,W,X,Y,id
0,23,0,0,14,28,12,14,2,9,8,...,4,20,25,12,0,24,1,0,11,Q9YEZ5
1,77,0,2,20,33,9,29,19,24,29,...,132,19,40,32,0,40,2,0,10,Q24762
2,11,0,5,18,21,16,17,4,19,25,...,11,13,15,9,0,28,0,0,11,B2V652
3,16,0,1,11,9,5,13,1,13,22,...,4,9,6,7,0,27,0,0,4,Q5HWG0
4,10,0,2,5,4,7,7,6,15,17,...,2,4,8,7,0,5,1,0,2,Q492D3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14184,36,0,3,19,22,9,22,6,33,24,...,10,18,21,23,0,20,0,0,6,P37947
14185,13,0,5,13,14,6,15,1,11,9,...,11,16,10,9,0,22,4,0,10,B7UFU5
14186,31,0,6,21,20,11,16,12,24,21,...,6,16,18,17,0,18,2,0,13,P37517
14187,18,0,0,11,21,12,23,2,21,19,...,8,11,23,19,0,24,0,0,9,P37551


## Sequence Classification using Deep Learning

In [23]:
X = df_counts.set_index("id")
X

Unnamed: 0_level_0,A,B,C,D,E,F,G,H,I,K,...,P,Q,R,S,T,U,V,W,X,Y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q9YEZ5,23,0,0,14,28,12,14,2,9,8,...,7,4,20,25,12,0,24,1,0,11
Q24762,77,0,2,20,33,9,29,19,24,29,...,24,132,19,40,32,0,40,2,0,10
B2V652,11,0,5,18,21,16,17,4,19,25,...,13,11,13,15,9,0,28,0,0,11
Q5HWG0,16,0,1,11,9,5,13,1,13,22,...,4,4,9,6,7,0,27,0,0,4
Q492D3,10,0,2,5,4,7,7,6,15,17,...,2,2,4,8,7,0,5,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P37947,36,0,3,19,22,9,22,6,33,24,...,16,10,18,21,23,0,20,0,0,6
B7UFU5,13,0,5,13,14,6,15,1,11,9,...,14,11,16,10,9,0,22,4,0,10
P37517,31,0,6,21,20,11,16,12,24,21,...,12,6,16,18,17,0,18,2,0,13
P37551,18,0,0,11,21,12,23,2,21,19,...,9,8,11,23,19,0,24,0,0,9


In [24]:
y = np.array(df['label'])

We will perform a 10-fold cross-validation to measure the performance of the classification model.

In [25]:
kfold = 10
random_state = 1

xval_accuracy = np.zeros(kfold)
xval_recall = np.zeros(kfold)
xval_f1 = np.zeros(kfold)
skf = KFold(n_splits = kfold, shuffle = True, random_state = random_state)
k = 0
epochs = 50
batch_size = 128

models = []
for train_index, xval_index in skf.split(X, y):
    X_train, X_xval = X.iloc[train_index], X.iloc[xval_index]
    y_train, y_xval = y[train_index], y[xval_index]
    
    model = Sequential()
    model.add(Dense(64, input_shape = (X_train.shape[1],), activation='relu')) 
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=0)
    models.append(model)
    
    y_pred = model.predict(X_xval).round().astype(int)

    xval_accuracy[k] = sklearn.metrics.accuracy_score(y_xval, y_pred)
    xval_recall[k] = sklearn.metrics.recall_score(y_xval, y_pred)
    xval_f1[k] = sklearn.metrics.f1_score(y_xval, y_pred)
    k+=1
    
print ('Average accuracy score', np.mean(xval_accuracy))
print ('Average recall score', np.mean(xval_recall))
print ('Average f1 score', np.mean(xval_f1))

2022-10-05 23:42:13.497287: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Average accuracy score 0.7913880829484202
Average recall score 0.8459900610638303
Average f1 score 0.8026398905285813


Given the simplicity of BoW embedding, accuracy of 79.1% is not bad!