# Prototype Pipeline for PSSM network (cont'd)
The aim of this bare-bones pipeline is to serve as the prototype for retraining jnet in Python. It uses only PSSM information, leaving out the HMM profiles for now. PSSM profiles are used because they are readily available, whereas HMM profiles are not available in the retr231 folder.
The following list outlines the key steps in the pipeline:

### First notebook (PSSM_parse.ipynb)
1. load PSSM matrices of training and blind-test sets (DONE)

2. incorporate DSSP information of training and blind-test set (DONE)

### **Second notebook (PSSM_patternify.ipynb)**

3. Generate patterns from PSSM profiles using sliding window

### Third notebook (PSSM_linearize.ipynb)

4. linearize patterns

### Fourth notebook (PSSM_net.ipynb)

5. write and train ML model

6. make predictions on blind-test set

7. score accuracy

Import core modules

In [1]:
import pandas as pd
import numpy as np
import pickle

Load in the pssm list of dataframes which was previously saved as pickle objects

In [2]:
# Directories holding the pickled PSSM profiles (presumably written by PSSM_parse.ipynb).
train_dir = '/homes/2472402/outputs/train/'
test_dir = '/homes/2472402/outputs/test/'
# Both input paths are defined here because later cells read test_in and build
# the test output path from test_dir.
train_in = train_dir + 'pssm_profiles_train.pkl'
test_in = test_dir + 'pssm_profiles_test.pkl'

# read the pickled list of training profiles (a list of dataframes)
with open(train_in, 'rb') as f_read:
    pssm_profiles_train = pickle.load(f_read)

pssm_profiles_train[0] # look at first train profile for sanity check

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,dssp
R,0.119203,0.993307,0.268941,0.119203,0.017986,0.952574,0.500000,0.047426,0.268941,0.017986,...,0.982014,0.119203,0.017986,0.047426,0.268941,0.119203,0.017986,0.047426,0.047426,-
Q,0.047426,0.268941,0.268941,0.880797,0.006693,0.500000,0.952574,0.047426,0.999665,0.006693,...,0.731059,0.047426,0.047426,0.047426,0.119203,0.047426,0.017986,0.268941,0.017986,-
F,0.017986,0.006693,0.006693,0.002473,0.017986,0.006693,0.006693,0.006693,0.047426,0.119203,...,0.006693,0.119203,0.999665,0.002473,0.017986,0.017986,0.999877,0.731059,0.047426,-
G,0.268941,0.017986,0.119203,0.047426,0.017986,0.017986,0.017986,0.999089,0.017986,0.002473,...,0.047426,0.006693,0.006693,0.017986,0.119203,0.047426,0.017986,0.006693,0.006693,-
H,0.268941,0.268941,0.993307,0.982014,0.006693,0.500000,0.731059,0.268941,0.993307,0.006693,...,0.119203,0.017986,0.006693,0.047426,0.268941,0.268941,0.006693,0.047426,0.017986,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K,0.047426,0.268941,0.993307,0.119203,0.006693,0.268941,0.119203,0.731059,0.731059,0.047426,...,0.982014,0.880797,0.017986,0.017986,0.500000,0.119203,0.017986,0.731059,0.017986,H
R,0.119203,0.997527,0.119203,0.047426,0.017986,0.268941,0.119203,0.119203,0.119203,0.006693,...,0.880797,0.047426,0.006693,0.017986,0.952574,0.500000,0.006693,0.017986,0.017986,-
K,0.047426,0.731059,0.119203,0.500000,0.006693,0.500000,0.500000,0.017986,0.047426,0.006693,...,0.999089,0.047426,0.006693,0.047426,0.119203,0.047426,0.006693,0.017986,0.017986,-
N,0.047426,0.119203,0.999665,0.952574,0.006693,0.119203,0.268941,0.119203,0.268941,0.006693,...,0.268941,0.017986,0.006693,0.017986,0.268941,0.119203,0.002473,0.017986,0.006693,-


Repeat for test set

In [182]:
# Unpickle the blind-test profiles from the path stored in test_in.
with open(test_in, 'rb') as handle:
    pssm_profiles_test = pickle.load(handle)

pssm_profiles_test[0] # look at first test profile for sanity check

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,dssp
Q,0.119203,0.731059,0.268941,0.268941,0.017986,0.993307,0.880797,0.047426,0.268941,0.017986,...,0.982014,0.119203,0.017986,0.119203,0.268941,0.119203,0.017986,0.047426,0.047426,-
K,0.119203,0.731059,0.880797,0.268941,0.017986,0.952574,0.500000,0.119203,0.268941,0.017986,...,0.997527,0.119203,0.017986,0.119203,0.268941,0.119203,0.017986,0.047426,0.047426,-
D,0.119203,0.500000,0.500000,0.993307,0.006693,0.952574,0.880797,0.047426,0.119203,0.017986,...,0.982014,0.047426,0.006693,0.119203,0.268941,0.119203,0.006693,0.047426,0.017986,H
L,0.119203,0.047426,0.047426,0.017986,0.047426,0.047426,0.047426,0.017986,0.047426,0.500000,...,0.047426,0.500000,0.982014,0.017986,0.119203,0.952574,0.119203,0.268941,0.268941,H
A,0.993307,0.047426,0.047426,0.047426,0.119203,0.119203,0.119203,0.268941,0.047426,0.500000,...,0.119203,0.119203,0.047426,0.119203,0.880797,0.268941,0.017986,0.047426,0.268941,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P,0.119203,0.017986,0.017986,0.047426,0.017986,0.047426,0.047426,0.017986,0.017986,0.017986,...,0.119203,0.017986,0.006693,0.999665,0.119203,0.119203,0.006693,0.017986,0.268941,-
T,0.119203,0.500000,0.731059,0.119203,0.047426,0.731059,0.119203,0.047426,0.047426,0.268941,...,0.731059,0.119203,0.047426,0.047426,0.268941,0.982014,0.017986,0.047426,0.952574,-
P,0.731059,0.119203,0.047426,0.119203,0.017986,0.119203,0.731059,0.047426,0.047426,0.017986,...,0.500000,0.047426,0.006693,0.999089,0.268941,0.119203,0.006693,0.017986,0.047426,-
P,0.119203,0.047426,0.047426,0.047426,0.017986,0.047426,0.119203,0.047426,0.047426,0.017986,...,0.119203,0.017986,0.006693,0.999665,0.119203,0.119203,0.006693,0.017986,0.017986,-


# 3 (i) Generate 17-residue patterns with sliding window for an example profile

Below is example of steps taken for one profile only:

Take a profile, add 8 rows of padding (i.e. NaN) to the profile at the top and bottom rows
This prepares the profile for sliding window operation

In [14]:
pf = pssm_profiles_train[0] # start with one profile as example

# Number of padding rows placed before the first row and after the last row of
# the profile. A 17-row window centred on a residue needs 8 rows on each side.
flank = 8

# Build the padding block in one call: `flank` rows of NaN with exactly the same
# columns as the PSSM profile (the dssp column is therefore NaN as well).
# This avoids DataFrame.append, which was removed in pandas 2.0, and does not
# hard-code the number of columns.
pad_df = pd.DataFrame(np.nan, index=range(flank), columns=pf.columns)

pad_df # look at the padding dataframe that will be added

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,dssp
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,


Concatenate padded rows into top and bottom of profile

In [149]:
# Swap the row labels of pf for plain positions 0..n-1. The original author
# notes that pd.concat otherwise complained about a non-unique index.
pf.index = range(len(pf.index))

# Stack padding + profile + padding and renumber the rows of the result.
pf1 = pd.concat([pad_df, pf, pad_df], ignore_index=True)

pf1.tail(20)

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,dssp
69,0.047426,0.5,0.982014,0.268941,0.017986,0.268941,0.952574,0.017986,0.119203,0.731059,...,0.952574,0.047426,0.017986,0.017986,0.268941,0.880797,0.006693,0.017986,0.268941,H
70,0.047426,0.880797,0.731059,0.5,0.006693,0.880797,0.982014,0.119203,0.952574,0.006693,...,0.952574,0.017986,0.017986,0.119203,0.119203,0.268941,0.017986,0.731059,0.017986,H
71,0.047426,0.880797,0.119203,0.119203,0.006693,0.268941,0.982014,0.017986,0.5,0.047426,...,0.952574,0.047426,0.017986,0.017986,0.268941,0.731059,0.017986,0.731059,0.268941,H
72,0.017986,0.047426,0.997527,0.268941,0.006693,0.119203,0.047426,0.017986,0.997527,0.017986,...,0.047426,0.047426,0.952574,0.006693,0.119203,0.047426,0.047426,0.993307,0.047426,H
73,0.5,0.119203,0.119203,0.5,0.017986,0.731059,0.993307,0.017986,0.982014,0.268941,...,0.268941,0.047426,0.017986,0.047426,0.268941,0.880797,0.006693,0.047426,0.268941,H
74,0.731059,0.880797,0.731059,0.119203,0.017986,0.880797,0.5,0.017986,0.119203,0.017986,...,0.880797,0.047426,0.880797,0.119203,0.119203,0.047426,0.119203,0.993307,0.047426,H
75,0.731059,0.268941,0.017986,0.017986,0.268941,0.047426,0.268941,0.119203,0.047426,0.5,...,0.047426,0.880797,0.268941,0.017986,0.731059,0.119203,0.047426,0.880797,0.119203,H
76,0.047426,0.268941,0.993307,0.119203,0.006693,0.268941,0.119203,0.731059,0.731059,0.047426,...,0.982014,0.880797,0.017986,0.017986,0.5,0.119203,0.017986,0.731059,0.017986,H
77,0.119203,0.997527,0.119203,0.047426,0.017986,0.268941,0.119203,0.119203,0.119203,0.006693,...,0.880797,0.047426,0.006693,0.017986,0.952574,0.5,0.006693,0.017986,0.017986,-
78,0.047426,0.731059,0.119203,0.5,0.006693,0.5,0.5,0.017986,0.047426,0.006693,...,0.999089,0.047426,0.006693,0.047426,0.119203,0.047426,0.006693,0.017986,0.017986,-


Carrying on from the padded profile...

Slice the padded profile into equal-dimension dataframes with shape (17,21) and store it in a list called patterns. Each pattern is a training example for the CNN

In [151]:
# Using iloc based indexing. i.e. 'location based indexing' cf label based indexing
# idx of first amino acid of sequence in padded dataframe
seqStartIdx = flank
# idx one past the last amino acid of sequence in padded dataframe (exclusive bound for range)
seqEndIdx = pf1.shape[0] - flank

# create a list of dataframes which we will put individual windows into. this list of patterns should span across samples
patterns = []

for seqIdx in range(seqStartIdx, seqEndIdx): # for each residue position / centre of window
    # The window half-width is tied to `flank` so the window always fits inside
    # the padded frame and stays centred on seqIdx.
    winStartIdx = seqIdx - flank
    winEndIdx = seqIdx + flank + 1 # +1 because the slice end is exclusive
    # again using location based indexing with .iloc[]
    pattern = pf1.iloc[winStartIdx:winEndIdx, :]
    patterns.append(pattern)

Check that the number of patterns equals the sequence length, i.e. the number of rows in the padded profile minus the 2*8 = 16 padding rows

In [154]:
# number of windows generated from the example profile
len(patterns)

73

In [156]:
# rows in the padded profile minus the 16 padding rows (8 at the top + 8 at the bottom)
pf1.shape[0]-16

73

Now repeat the above steps for the entire dataset, saving it in a list

# 3 (ii) Repeat above steps to generate windows of all sequences

Here I specify 2 functions that act on individual PSSM dataframes. One creates padding, another creates the windows from padded dataframes

Start with the function that pads the dataframe

In [3]:
def padding_operation(profile, flank):
    """Return a copy of ``profile`` with ``flank`` all-NaN rows above and below.

    The padding lets a sliding window be centred on the first and last rows of
    the profile. Per the original notes, flank = 8 is used for layer 1
    (seq to struct) and flank = 9 for layer 2 (struct to struct).

    Args:
        profile: DataFrame to pad. It is not modified.
        flank: Number of NaN rows to add at each end; must be >= 0.

    Returns:
        A new DataFrame with ``len(profile) + 2 * flank`` rows, the same
        columns as ``profile`` and a fresh 0..n-1 integer index.

    Raises:
        ValueError: If ``flank`` is negative.
    """
    if flank < 0:
        raise ValueError("flank must be non-negative")
    # Replace the row labels with positions 0..n-1. reset_index returns a new
    # frame, so the caller's profile keeps its own index.
    pf = profile.reset_index(drop=True)
    # Ask for `flank` extra positions on either side; positions that do not
    # exist in pf come back as rows of NaN in every column. This works for any
    # number of columns and avoids DataFrame.append (removed in pandas 2.0).
    padded_pf = pf.reindex(range(-flank, len(pf) + flank))
    # Renumber so the first padding row is row 0.
    return padded_pf.reset_index(drop=True)

Now, the function to generate windows from individual dataframes

In [4]:
def sliding_operation(padded_pf, flank):
    """Cut a padded profile into one window per original (unpadded) row.

    Each window is centred on one unpadded row and spans ``flank`` rows on
    either side, i.e. ``2 * flank + 1`` rows in total (17 rows for flank = 8).
    The window width follows ``flank``, so it always matches the padding
    produced by ``padding_operation`` called with the same ``flank``.

    Args:
        padded_pf: DataFrame returned by ``padding_operation``.
        flank: Number of padding rows at each end of ``padded_pf``; also the
            half-width of every window.

    Returns:
        A list of DataFrames (variable name ``patterns`` elsewhere in this
        notebook), one per unpadded row, in row order. Each is an ``.iloc``
        slice of ``padded_pf`` containing all columns.
    """
    window_size = 2 * flank + 1
    # Number of real (unpadded) rows = number of windows to produce.
    n_residues = padded_pf.shape[0] - 2 * flank
    # The window centred on padded row (start + flank) begins at `start`,
    # using location based indexing with .iloc[].
    return [
        padded_pf.iloc[start:start + window_size, :]
        for start in range(n_residues)
    ]

Wrapper code calling two functions above on each PSSM dataframe to generate patterns from all dataframes

In [5]:
# One entry per training profile; each entry is that profile's list of windows.
total_patterns = []
for profile in pssm_profiles_train:
    padded_pf = padding_operation(profile, flank=8)
    patterns = sliding_operation(padded_pf, flank=8)
    total_patterns += [patterns]

len(total_patterns)

1348

In [7]:
# save as input for structure to structure net
# skip if not running this to get input into layer 2 net
file = '/homes/2472402/data/patterned_train_sequences.pkl'
# Use a context manager so the file handle is flushed and closed as soon as the
# dump finishes (a bare open() inside the call is never explicitly closed).
with open(file, 'wb') as f_write:
    pickle.dump(total_patterns, f_write, protocol=pickle.HIGHEST_PROTOCOL)

Flatten the nested list into a single list

In [162]:
from itertools import chain

# Join the per-profile lists end to end into one flat list of windows.
total_patterns_flattened = list(chain(*total_patterns))

len(total_patterns_flattened)

218421

Check that each sequence in train set contributes the same number of patterns as its sequence length

In [163]:
# every n-residue sequence should have produced exactly n patterns

# rows (residues) in each training profile
num_of_rows = pd.Series([len(pf) for pf in pssm_profiles_train])

# windows generated from each training profile
total_patterns_len = list(map(len, total_patterns))

# True only when the two counts agree for every profile
all(num_of_rows == total_patterns_len)

True

# 3(iii). Repeat steps in 3 (ii) for test dataset

Generate patterns from all PSSM dataframes in test set and combine into a list

In [171]:
# Same procedure as for the training set: pad each test profile, then window it.
total_patterns_test = []
for profile in pssm_profiles_test:
    padded_pf = padding_operation(profile, flank=8)
    patterns = sliding_operation(padded_pf, flank=8)
    total_patterns_test += [patterns]

len(total_patterns_test)

149

Flatten the nested list into a single list

In [173]:
# Join the per-profile test lists end to end into one flat list of windows.
total_patterns_test_flattened = list(chain(*total_patterns_test))

len(total_patterns_test_flattened)

22734

Check that each sequence in test set contributes the same number of patterns as its sequence length

In [175]:
# rows (residues) in each test profile
num_of_rows_test = pd.Series([len(pf) for pf in pssm_profiles_test])

# windows generated from each test profile
total_patterns_len_test = list(map(len, total_patterns_test))

# True only when the two counts agree for every profile
all(num_of_rows_test == total_patterns_len_test)

True

# 3(iv). Save patterns as pkl object

Define some functions and path variables

In [178]:
# Output paths for the flattened pattern lists. Both lines rely on train_dir and
# test_dir having been defined earlier in the notebook.
train_out = train_dir + 'patterned_train_data.pkl' # absolute filepaths with .pkl extension
# NOTE(review): 'patterneded' looks like a typo for 'patterned'; left as-is because
# other notebooks may read this exact filename - confirm before renaming.
test_out = test_dir + 'patterneded_test_data.pkl'

def save_patterns(list_of_patterns, out_path):
    """Pickle ``list_of_patterns`` to ``out_path`` and return 1.

    The file is opened in binary write mode ('wb'), so an existing file at
    ``out_path`` is overwritten. The highest pickle protocol available is used.
    """
    with open(out_path, 'wb') as sink:
        pickle.dump(list_of_patterns, sink, protocol=pickle.HIGHEST_PROTOCOL)
    return 1

Save files as .pkl objects

In [2]:
# Write the flattened train and test pattern lists to train_out and test_out.
# Both calls need save_patterns and the *_out paths to exist in the current
# kernel session, i.e. the cell defining them must have been run first.
save_patterns(total_patterns_flattened, train_out)
save_patterns(total_patterns_test_flattened, test_out)

NameError: name 'save_patterns' is not defined