In [1]:
import warnings
warnings.filterwarnings('ignore')

from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Polypeptide import PPBuilder
from Bio.PDB.vectors import calc_dihedral

import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, math
import random
import nltk
import urllib.request, urllib.error

def degrees(rad):
    return (rad * 180) / math.pi

def phi_psi_omega_to_abego(phi, psi, omega):
    #if np.isnan(psi): return ‘O’
    #if np.isnan(omega): omega = 180
    #if np.isnan(phi): phi=90
    
    if np.isnan(phi) or np.isnan(psi) or np.isnan(omega): 
        return 'X'
    
    if abs(omega) < 90:
        return 'O'
    
    elif phi > 0:
        if -100.0 <= psi < 100:
            return 'G'
        else:
            return 'E'
        
    else:
        if -75.0 <= psi < 50:
            return 'A'
        else:
            return 'B'
        
    return 'X'

In [345]:
df = pd.read_csv('2018-06-06-ss.cleaned.csv') # read in CSV

# first 200 pdb ids
input_pdbs = df['pdb_id'].values.T

#input_pdbs

random_pdbs = [input_pdbs[random.randrange(1, len(input_pdbs) - 1, 1)] for i in range(1000)]
random_pdbs

['2GSG',
 '3VXS',
 '3PQ4',
 '5JMW',
 '2C9D',
 '3J54',
 '6EZN',
 '4EAK',
 '3T87',
 '1VNF',
 '1XSM',
 '4D0T',
 '1VWU',
 '3RY1',
 '4PBU',
 '3ZE0',
 '1W4W',
 '5UDG',
 '3V17',
 '1UIF',
 '3TBW',
 '5KZF',
 '4Y8U',
 '4I6W',
 '4ARE',
 '5B7O',
 '2WWB',
 '2FW1',
 '3CD6',
 '4R7Q',
 '1FQ0',
 '2W2W',
 '1HWK',
 '4A04',
 '4ZCC',
 '4QFS',
 '5JJ7',
 '5DXM',
 '3JB9',
 '1LE6',
 '5KLK',
 '2QMR',
 '3HE8',
 '1Q3B',
 '3AU6',
 '1AU1',
 '4LYE',
 '3QZU',
 '5KSW',
 '3CS1',
 '5H6T',
 '1J0H',
 '4L4Q',
 '4S23',
 '4QMG',
 '2MK9',
 '4QJT',
 '3G97',
 '2QJG',
 '5HI9',
 '2UX0',
 '2IXC',
 '1WOX',
 '1UP2',
 '4U8U',
 '3UT5',
 '2M16',
 '3N8N',
 '4A8S',
 '4HI1',
 '4QVQ',
 '5VVH',
 '3BBX',
 '4P5H',
 '4ALR',
 '5HL7',
 '1X8B',
 '3WDS',
 '5C66',
 '4CZ7',
 '4Q3W',
 '5I96',
 '3BMO',
 '2O6T',
 '3NCY',
 '4LHV',
 '4ABR',
 '3HXJ',
 '3UOK',
 '4X6X',
 '5UT2',
 '4HHC',
 '5JRV',
 '1TU6',
 '5TJ9',
 '5Y9J',
 '2YQV',
 '5QAJ',
 '4HPT',
 '3DV0',
 '1MNF',
 '3ED0',
 '3T3C',
 '1CPJ',
 '3OB0',
 '4ZQC',
 '2OQ0',
 '3HWJ',
 '4X01',
 '5GLI',
 '1F2H',
 

In [347]:
files = []

for pdb in random_pdbs:
    
    try:
        if (pdb + ".pdb") not in os.listdir((os.getcwd() + '/pdb_files/')):
            filename = urllib.request.urlretrieve('https://files.rcsb.org/download/{}.pdb'.format(pdb), pdb + '.pdb')
            path = os.path.join(os.getcwd(), pdb + '.pdb')
            os.rename(path, os.getcwd() + '/pdb_files/' + pdb + '.pdb') 
            files.append(filename[0])
        else:
            files.append(pdb + ".pdb")
        
    except urllib.error.HTTPError:
        continue    
        
files = set(files)

In [348]:
# output set
abegopatterns = []

# input set
seqs = []

cwd = os.getcwd()

for file in files:
        
    phi_psi = []
    nres = []
    
    ran = False
    repeat = True
        
    structure = PDBParser(QUIET = True).get_structure(cwd + "/pdb_files/" + file, cwd + "/pdb_files/" + file)

    for chain in structure:

        polypeptides = PPBuilder().build_peptides(chain)

        for polypeptide in polypeptides:

            ran = True
            
            # a list of polypeptide chain lengths
            nres.append(len(polypeptide))
            
            if len(nres) > 1:
                nres[-1] = nres[-1] + nres[-2]
            
            phi_psi += polypeptide.get_phi_psi_list()  
            
            if polypeptide.get_sequence() not in seqs:
                repeat = False # don't want duplicate sequences
                seqs.append(polypeptide.get_sequence())

            break # only the first subunit for now
            
        break
    
    if not(ran) or repeat:
        continue
        
    phi_psi_omega = []

    residues = [res for res in structure.get_residues()]

    for i in range(len(residues) - 1):

        if (i + 1) in nres:
            omega = None
            break

        else:
            try:
                a1 = residues[i]['CA'].get_vector()
                a2 = residues[i]['C'].get_vector()
                a3 = residues[i + 1]['N'].get_vector()
                a4 = residues[i + 1]['CA'].get_vector()
                
                omega = calc_dihedral(a1,a2,a3,a4)
                
                phi_psi_omega.append((phi_psi[i][0], phi_psi[i][1], omega))
                
            except KeyError:
                # phi_psi_omega.append((phi_psi[i][0], phi_psi[i][1], None))
                # seqs.pop()
                continue
    
    # last triplet tuple
    phi_psi_omega.append((phi_psi[-1][0], phi_psi[-1][1], None))
    
    # ABEGO str
    abego = ""

    for phi, psi, omega in phi_psi_omega: 
        if phi != None and psi != None and omega != None:
            abego += phi_psi_omega_to_abego(degrees(phi), degrees(psi), degrees(omega))
        
    abegopatterns.append(abego)

In [349]:
print(len(seqs), len(abegopatterns))

for i in range(len(seqs)):
    seqs[i] = str(seqs[i])

def seq2ngrams(seqs, n=3):
    return np.array([[seq[i:i+n] for i in range(len(seq))] for seq in seqs])

input_grams = seq2ngrams(seqs)

908 908


In [350]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import hashing_trick
from tensorflow.keras.utils import to_categorical

# encoder, turns sequence into a fixed vector of numbers
maxLength = max(len(seq) for seq in seqs) 

tokenizer_encoder = Tokenizer() # Tokenizer Class Instance

tokenizer_encoder.fit_on_texts(input_grams) # tokenize the input_grams, updates internal unique vocabulary

input_data = tokenizer_encoder.texts_to_sequences(input_grams) # assigns the text a number

input_data = pad_sequences(input_data, maxlen=maxLength, padding='post')

# decoder
tokenizer_decoder = Tokenizer(char_level = True) # every character will be treated as a token because it's ABEGO

tokenizer_decoder.fit_on_texts(abegopatterns) 

target_data = tokenizer_decoder.texts_to_sequences(abegopatterns)

target_data = pad_sequences(target_data, maxlen=maxLength, padding='post')

target_data = to_categorical(target_data) # oneHotEncoder

input_data.shape, target_data.shape

((908, 1496), (908, 1496, 6))

In [None]:
letters = np.array([[letter for letter in abego if (letter != None)] for abego in abegopatterns])
df = pd.DataFrame(columns=range(len(letters)))
for i in range(len(letters)):
    df[i] = pd.Series(letters[i])
    
cat_encoder = OneHotEncoder()
cat_encoder.fit(df[[0]])

cat_encoder.categories_
practice = []
for i in range(len(letters)):
    practice.append(cat_encoder.transform(df[[i]]).toarray())

In [None]:
# one-hot encoding
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
abego_cat_1hot = cat_encoder.fit_transform(df)
cat_encoder.categories_

In [375]:
maxlen_seq = maxLength

n_words = len(tokenizer_encoder.word_index) + 1 # Number of Possible Amino Acids

n_tags = len(tokenizer_decoder.word_index) + 1 # Possible ABEGO Patterns

new_model = keras.Sequential([
    keras.layers.InputLayer(input_shape=(maxlen_seq,)),
    keras.layers.Embedding(input_dim = n_words, output_dim = maxlen_seq, input_length = maxlen_seq), # word vectors in (maxLength) number of dimensions
    keras.layers.Bidirectional(keras.layers.LSTM(units=100, return_sequences=True)),
    keras.layers.TimeDistributed(keras.layers.Dense(n_tags, activation="softmax"))
#   keras.layers.Dense(12000, activation="relu"),
#   keras.layers.Embedding(n_words, 750, input_length = 32)
#   keras.layers.Dense(1, input_shape=(5,), activation="softmax")
])

new_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 1496, 1496)        12342000  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1496, 200)         1277600   
_________________________________________________________________
time_distributed_2 (TimeDist (None, 1496, 6)           1206      
Total params: 13,620,806
Trainable params: 13,620,806
Non-trainable params: 0
_________________________________________________________________


In [376]:
from sklearn.model_selection import train_test_split

# model.compile defines the loss function, optimizer, and metrics
# first metric is Keras provided, second metric is custom metric
new_model.compile(optimizer="RMSprop", loss="categorical_crossentropy", metrics = ["accuracy"]) 

X_train, X_test, y_train, y_test = train_test_split(input_data, target_data, test_size = .4, random_state=0)

seq_train, seq_test, target_train, target_test = train_test_split(seqs, abegopatterns, test_size=.4, random_state=0)

new_model.fit(X_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1a67b4f10>

In [377]:
reverse_decoder_index = {value:key for key, value in tokenizer_decoder.word_index.items()}

In [378]:
test_predictions = new_model.predict(X_test)

In [379]:
random_list = [random.randrange(1, len(test_predictions) - 1, 1) for i in range(5)]

for i in random_list:
    pred = ''
    for letter in test_predictions[i]:
        if np.argmax(letter) != 0:
            pred += reverse_decoder_index[np.argmax(letter)]

    test = ''
    for letter in y_test[i]:
        if np.argmax(letter) != 0:
            test += reverse_decoder_index[np.argmax(letter)]

    print("Sequence: \n", seq_test[i].upper())
    print("Actual: \n", test.upper())
    print("Predicted: \n", pred.upper())
    
    print("\n")

Sequence: 
 GEYGQRFMWLWNKIHDPANGYFNQDGIPYHSVETLICEAPDYGHLTTSEAFSYYVWLEAVYGKLTGDWSKFKTAWDTLEKYMIPSAEDQPM
Actual: 
 AAAAAAAAAAAAAABAAAGABBAAGBBBABABABBBAABABAABBBAAAAAAAAAAAAAAAAAAGBAAAAAAAAAAAAAAABBBAAABB
Predicted: 
 BAABABBBBBBBBBBABBBBBBBBBBBBBBBBAAAABBBAABBBBBAAAAABBBBBBABBBBBBBBBBBBBBABBAAAAAAAAAAAA


Sequence: 
 ADTCYNDVALDCGITSNSLALPRCNAVYGEYGSHGNVATELQAYAKLHLERSYDYLLSAAYFNNYQTNRAGFSKLFKKLSDEAWSKTIDIIKHVTKRGDKMNFDQHSTMKTERKNYTAENHELEALAKALDTQKELAERAFYIHREATRNSQHLHDPEIAQYLEEEFIEDHAEKIRTLAGHTSDLKKFITANNGHDLSLALYVFDEYLQKTV
Actual: 
 BBAAAAAAAAABAABABBAGAABBABBGGBGABEAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAGBAAAAAAAAAAAAAAAAAAAAAAAAAAAAGBBBBAABBBABBBBBBBBBABBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAABBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABGGGAAAAAAAAAAAAAAAAA
Predicted: 
 BBBBBBBBBBBBBBBBABBBBBBBBBBBBBBBBBBBBAAAAAAAAAAAAAAAAAAABAAABABAAAAAAAAAAAAAABBAAAAAAAAAAAABBBABBBBBBBBBBBBBABAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAABBAABABBAAAABAAAAAAAAAAAAAAAAAAAAAAAABBBAAAAABBAAABBAAAAAABBBAAAABB


Sequence: 
 MKTTPDILDQ

In [26]:
train_predictions = new_model.predict(X_train) # training set predictions

In [98]:
random_list = [random.randrange(1, len(train_predictions) - 1, 1) for i in range(5)]

for i in random_list:
    pred = ''
    for letter in train_predictions[i]:
        if np.argmax(letter) != 0:
            pred += reverse_decoder_index[np.argmax(letter)]

    train = ''
    for letter in y_train[i]:
        if np.argmax(letter) != 0:
            train += reverse_decoder_index[np.argmax(letter)]

    print("Sequence: \n", seq_train[i].upper())
    print("Actual: \n", train.upper())
    print("Predicted: \n", pred.upper())
    
    print("\n")

Sequence: 
 PSGVEGAAFQSRLPHDRMTSQEAACFPDIISGPQQTQKVFLFIRNRTLQLWLDNPKIQLTFEATLQQLEAPYNSDTVLVHRVHSYLERHGLINFGIYKRIKPLPTKKTGKVIIIGSGVSGLAAARQLQSFGMDVTLLEARDRVGGRVATFRKGNYVADLGAMVVTGLGGNPMAVVSKQVNMELAKIKQKCPLYEANGQAVPKEKDEMVEQEFNRLLEATSYLSHQLDFNVLNNKPVSLGQALEVVIQLQEKHVKDEQIEHWKKIVKTQEELKELLNKMVNLKEKIKELHQQYKEASEVKPPRDITAEFLVKSKHRDLTALCKEYDELAETQGKLEEKLQELEANPPSDVYLSSRDRQILDWHFANLEFANATPLSTLSLKHWDQDDDFEFTGSHLTVRNGYSCVPVALAEGLDIKLNTAVRQVRYTASGCEVIAVNTRSTSQTFIYKCDAVLCTLPLGVLKQQPPAVQFVPPLPEWKTSAVQRMGFGNLNKVVLCFDRVFWDPSVNLFGHVGSTTASRGELFLFWNLYKAPILLALVAGEAAGIMENISDDVIVGRCLAILKGIFGSSAVPQPKETVVSRWRADPWARGSYSYVAAGSSGNDYDLMAQPITPGPSIPGAPQPIPRLFFAGEHTIRNYPATVHGALLSGLREAGRIADQFLGAMYTL
Actual: 
 BEAAAAAAAAGBBAABBBAAAAAABAAAAABBAAAAAAAAAAAAAAAAAAAABAABBBBAAAAAAABBOAAAABAAAAAAAAAAAAAAGABGBEBBBBABBBBABBBEBBBBBEABAAAAAAAAAAAAAGBBBBBBBABABBGEABBBBBBEABBBBABBABBBEABGBAAAAAAAABBBBBBBBBABBBBBBAAGBBBBAAAAAAAAAAAAAAAAAAAAAAAAAGBABBGGBBBBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBOBBBAAAAAAAAA

In [129]:
# confusion matrix and frequency of ABEGO for all sequences
from sklearn.metrics import confusion_matrix

np.set_printoptions(suppress=True) # suppress scientific notation

conf_matrix = np.zeros((5, 5))

final_counter = Counter()

for i in range(len(test_predictions)):
    
    pred = ''
    for letter in test_predictions[i]:
        if np.argmax(letter) != 0:
            pred += reverse_decoder_index[np.argmax(letter)]

    test = ''
    for letter in y_test[i]:
        if np.argmax(letter) != 0:
            test += reverse_decoder_index[np.argmax(letter)]
    
    temp_counter = Counter(pred) + Counter(test)
    final_counter = final_counter + temp_counter

    predicted = [char for char in pred]  

    actual = [char for char in test]

    if (len(predicted) == len(actual)):
        res = confusion_matrix(actual, predicted, labels=["b", "a", "g", "e", "o"])
        conf_matrix = conf_matrix + res

In [130]:
final_counter

Counter({'b': 194750, 'a': 183583, 'g': 18220, 'e': 4573, 'o': 760})

In [96]:
conf_matrix

array([[72182., 12563.,   905.,   186.,    12.],
       [15731., 66634.,   882.,    87.,     3.],
       [ 1874.,  1710.,  5201.,   119.,     0.],
       [  934.,   384.,   640.,   887.,     0.],
       [  361.,    51.,    12.,     6.,   115.]])

In [122]:
df_cm = pd.DataFrame(conf_matrix, ["b", "a", "g", "e", "o"], ["b", "a", "g", "e", "o"])
df_cm

Unnamed: 0,b,a,g,e,o
b,72182.0,12563.0,905.0,186.0,12.0
a,15731.0,66634.0,882.0,87.0,3.0
g,1874.0,1710.0,5201.0,119.0,0.0
e,934.0,384.0,640.0,887.0,0.0
o,361.0,51.0,12.0,6.0,115.0


In [131]:
# see the similarities between test and train set
# hamming distance possibly
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

0.6842105263157895

In [326]:
test_pdb = df['pdb_id'][-10000:]
test_pdb.values[0]

'4B3H'

In [327]:
if (test_pdb.values[0] + ".pdb") not in os.listdir((os.getcwd() + '/pdb_files/')):
            filename = urllib.request.urlretrieve('https://files.rcsb.org/download/{}.pdb'.format(test_pdb.values[0]), 
                                                  test_pdb.values[0] + '.pdb')
            path = os.path.join(os.getcwd(), test_pdb.values[0] + '.pdb')
            # os.rename(path, os.getcwd() + '/pdb_files/' + test_pdb + '.pdb') 

In [328]:
test_structure = PDBParser(QUIET = True).get_structure(test_pdb.values[0] + ".pdb", test_pdb.values[0] + ".pdb")

In [329]:
phi_psi = []
nres = []

for chain in test_structure:
    
    polypeptides = PPBuilder().build_peptides(chain)

    for polypeptide in polypeptides:

        # a list of polypeptide chain lengths
        nres.append(len(polypeptide))

        phi_psi += polypeptide.get_phi_psi_list()  

        test_seq = polypeptide.get_sequence()

        break # only the first subunit for now
            
    break
        
phi_psi_omega = []

residues = [res for res in test_structure.get_residues()]

for i in range(len(residues) - 1):

    if (i + 1) in nres:
        omega = None
        break

    else:
        try:
            a1 = residues[i]['CA'].get_vector()
            a2 = residues[i]['C'].get_vector()
            a3 = residues[i + 1]['N'].get_vector()
            a4 = residues[i + 1]['CA'].get_vector()

            omega = calc_dihedral(a1,a2,a3,a4)

            phi_psi_omega.append((phi_psi[i][0], phi_psi[i][1], omega))

        except KeyError:
            continue

# last triplet tuple
phi_psi_omega.append((phi_psi[-1][0], phi_psi[-1][1], None))

# ABEGO str
test_abego = ""

for phi, psi, omega in phi_psi_omega: 
    if phi != None and psi != None and omega != None:
        test_abego += phi_psi_omega_to_abego(degrees(phi), degrees(psi), degrees(omega))

In [330]:
test_seq = str(test_seq)

In [331]:
test_seq, test_abego

('SSHHHHHHS', 'BABABAB')

In [332]:
test_seq = [test_seq]

In [333]:
test_input_data = seq2ngrams(test_seq)

In [334]:
input_test_data = tokenizer_encoder.texts_to_sequences([list(test_input_data[0])]) # assigns the text a number

input_test_data = pad_sequences(input_test_data, maxlen=maxLength, padding='post')

In [335]:
input_test_data

array([[3374, 7155, 1390, ...,    0,    0,    0]], dtype=int32)

In [336]:
sample_prediction = new_model.predict(input_test_data)

In [337]:
test_pred = ''
for letter in sample_prediction[0]:
    if np.argmax(letter) != 0:
        test_pred += reverse_decoder_index[np.argmax(letter)]

In [338]:
test_pred

'bbbbbbb'

In [339]:
test_abego

'BABABAB'

In [340]:
similar(test_abego.lower(), test_pred)

0.5714285714285714

In [None]:
# get the probablities of each ABEGO 