# Prototype Pipeline for PSSM network (cont'd)
The aim of this bare-bones pipeline is to serve as the prototype for retraining Jnet in Python. It uses only PSSM information, leaving out the HMM profiles for now: PSSM profiles are readily available, whereas HMM profiles are not available in the retr231 folder.
The following list outlines the key steps in the pipeline:

### ***First notebook (PSSM_parse.ipynb)***
1. load PSSM matrices of training and blind-test sets (DONE)

2. incorporate DSSP information of training and blind-test set (DONE)

### Second notebook (PSSM_patternify.ipynb)

3. Generate patterns from PSSM profiles using sliding window

### Third notebook (PSSM_linearize.ipynb)

4. linearize patterns

### Fourth notebook (PSSM_net.ipynb)

5. write and train ML model

6. make predictions on blind-test set

7. score accuracy

# 1. Load PSSMs and process into clean dataframes

Import core modules

In [1]:
import pandas as pd
import math
import os
import pickle

Specify directory of dataset for training and testing
Assumes that current working directory is '/homes/2472402/'

In [2]:
# Dataset locations. File names are appended to these with plain string
# concatenation further down, so each path must keep its trailing '/'.
train_dir = '/cluster/gjb_lab/2472402/retr231/training/' # path to training dataset
test_dir = '/cluster/gjb_lab/2472402/retr231/blind/' # path to blind test dataset
# Output directory. WARNING: only user 2472402 has write permission here;
# change this to a directory under your own home before running.
out_dir = '/homes/2472402/outputs/' # path to write out files

Parse .profile files into pandas DataFrames

In [3]:
# obtain a list of all file names in directory
file_names = os.listdir(train_dir)

# keep only files whose name ends with '.profile'. A substring test would also
# accept names such as 'x.profile.bak', which would then give a wrong sequence
# ID when the 8-character suffix is stripped off.
profile_names = [f for f in file_names if f.endswith('.profile')]

# display first 5
profile_names[0:5]

['d1ppjg_.profile',
 'd2fpna1.profile',
 'd1x38a2.profile',
 'd3u9wa1.profile',
 'd1gd8a_.profile']

In [4]:
# derive each sequence ID by cutting the trailing '.profile' (8 characters) off
# the file name; the IDs are used later to locate the matching .dssp files and
# to name the per-sequence output files
sequence_IDs = [pn[:-len('.profile')] for pn in profile_names]

# display first 5
sequence_IDs[:5]

['d1ppjg_', 'd2fpna1', 'd1x38a2', 'd3u9wa1', 'd1gd8a_']

In [5]:
# read all profiles (files with the .profile extension), collate into a list.
# profiles[i] comes from profile_names[i], so it lines up with sequence_IDs[i].
profiles = [pd.read_csv(train_dir + pn) for pn in profile_names]

# alternative that keeps each profile keyed by its sequence ID (not used at present):
# profiles = {pn[:-8]: pd.read_csv(train_dir + pn) for pn in profile_names}

# column labels for the 20 PSSM score columns, stored as a constant.
# process_profile() assigns them positionally, so the order must match the
# column order inside the .profile files (not verified by the code).
column_names = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']

def logistic_func(x):
    """Map a PSSM score onto the interval [0, 1] with the logistic function.

    Args:
        x: the score, either an int or a string holding an integer (PSSM
            cells are parsed as strings). It is cast with ``int()`` first.

    Returns:
        float: ``1 / (1 + e**-x)``.
    """
    x = int(x)  # cast string to int
    if x >= 0:
        # exp(-x) <= 1 here, so it cannot overflow
        return 1 / (1 + math.exp(-x))
    # For negative x, exp(-x) can exceed the float range and raise
    # OverflowError. This algebraically equal form only evaluates exp(x) < 1.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

def process_profile(pf):
    """Turn one raw ``.profile`` dataframe into a clean residue-by-amino-acid PSSM.

    Args:
        pf: dataframe as returned by ``pd.read_csv`` on a ``.profile`` file.
            Only its 'Last position-specific scoring matrix computed' column
            is used; each entry is one whitespace-delimited text line.

    Returns:
        DataFrame with one row per residue (indexed by the residue letter) and
        the 20 columns named in ``column_names``. The cell values are left as
        the strings produced by ``str.split``; no numeric conversion is done.
    """
    # Keep the PSSM text column and drop its last 5 lines (per the original
    # note these hold the K and lambda statistics, not residue rows).
    raw_lines = pf['Last position-specific scoring matrix computed'][:-5]

    # Split every line on whitespace; one token per dataframe cell.
    tokens = pd.DataFrame([line.split() for line in raw_lines])

    # Row 0 is skipped. In the remaining rows, token 1 is the residue letter
    # (used as the row label) and tokens 2..21 are the 20 score columns.
    residues = tokens.iloc[1:, 1].tolist()
    scores = tokens.iloc[1:, 2:22]

    # Same column names for every profile; rows are labelled by residue.
    scores.columns = column_names
    scores.index = residues
    return scores

# clean every raw profile in turn with process_profile()
profiles = list(map(process_profile, profiles))

# view the first profile as an example PSSM
profiles[0]

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,L,K,M,F,P,S,T,W,Y,V
R,-2,5,-1,-2,-4,3,0,-3,-1,-4,-3,4,-2,-4,-3,-1,-2,-4,-3,-3
Q,-3,-1,-1,2,-5,0,3,-3,8,-5,-4,1,-3,-3,-3,-2,-3,-4,-1,-4
F,-4,-5,-5,-6,-4,-5,-5,-5,-3,-2,-2,-5,-2,8,-6,-4,-4,9,1,-3
G,-1,-4,-2,-3,-4,-4,-4,7,-4,-6,-6,-3,-5,-5,-4,-2,-3,-4,-5,-5
H,-1,-1,5,4,-5,0,1,-1,5,-5,-5,-2,-4,-5,-3,-1,-1,-5,-3,-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K,-3,-1,5,-2,-5,-1,-2,1,1,-3,-2,4,2,-4,-4,0,-2,-4,1,-4
R,-2,6,-2,-3,-4,-1,-2,-2,-2,-5,-4,2,-3,-5,-4,3,0,-5,-4,-4
K,-3,1,-2,0,-5,0,0,-4,-3,-5,-5,7,-3,-5,-3,-2,-3,-5,-4,-4
N,-3,-2,8,3,-5,-2,-1,-2,-1,-5,-5,-1,-4,-5,-4,-1,-2,-6,-4,-5


In [7]:
# Pickle the cleaned PSSM profiles. The context manager closes the file as soon
# as the dump finishes; handing a bare open() to pickle.dump leaves the handle
# open until it happens to be garbage-collected.
with open('/homes/2472402/data/pssm/pssm_all.pkl', 'wb') as f_out:
    pickle.dump(profiles, f_out, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# export every processed PSSM as its own CSV file, named after its sequence ID
for seq_ID, profile in zip(sequence_IDs, profiles):
    path = ''.join(['/homes/2472402/data/pssm/', seq_ID, '.pssm'])
    profile.to_csv(path)

# Transform PSSM profiles using logistic function

In [105]:
# this is to conform to original Jnet which uses logistic regression classification
# (every cell of every profile is passed through logistic_func, which also casts
# the string scores to int, so the result holds floats)
# NOTE(review): recent pandas releases deprecate DataFrame.applymap in favour of
# DataFrame.map -- confirm against the installed pandas version.

profiles = [profile.applymap(logistic_func) for profile in profiles]

profiles[0]

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,L,K,M,F,P,S,T,W,Y,V
R,0.119203,0.993307,0.268941,0.119203,0.017986,0.952574,0.500000,0.047426,0.268941,0.017986,0.047426,0.982014,0.119203,0.017986,0.047426,0.268941,0.119203,0.017986,0.047426,0.047426
Q,0.047426,0.268941,0.268941,0.880797,0.006693,0.500000,0.952574,0.047426,0.999665,0.006693,0.017986,0.731059,0.047426,0.047426,0.047426,0.119203,0.047426,0.017986,0.268941,0.017986
F,0.017986,0.006693,0.006693,0.002473,0.017986,0.006693,0.006693,0.006693,0.047426,0.119203,0.119203,0.006693,0.119203,0.999665,0.002473,0.017986,0.017986,0.999877,0.731059,0.047426
G,0.268941,0.017986,0.119203,0.047426,0.017986,0.017986,0.017986,0.999089,0.017986,0.002473,0.002473,0.047426,0.006693,0.006693,0.017986,0.119203,0.047426,0.017986,0.006693,0.006693
H,0.268941,0.268941,0.993307,0.982014,0.006693,0.500000,0.731059,0.268941,0.993307,0.006693,0.006693,0.119203,0.017986,0.006693,0.047426,0.268941,0.268941,0.006693,0.047426,0.017986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K,0.047426,0.268941,0.993307,0.119203,0.006693,0.268941,0.119203,0.731059,0.731059,0.047426,0.119203,0.982014,0.880797,0.017986,0.017986,0.500000,0.119203,0.017986,0.731059,0.017986
R,0.119203,0.997527,0.119203,0.047426,0.017986,0.268941,0.119203,0.119203,0.119203,0.006693,0.017986,0.880797,0.047426,0.006693,0.017986,0.952574,0.500000,0.006693,0.017986,0.017986
K,0.047426,0.731059,0.119203,0.500000,0.006693,0.500000,0.500000,0.017986,0.047426,0.006693,0.006693,0.999089,0.047426,0.006693,0.047426,0.119203,0.047426,0.006693,0.017986,0.017986
N,0.047426,0.119203,0.999665,0.952574,0.006693,0.119203,0.268941,0.119203,0.268941,0.006693,0.006693,0.268941,0.017986,0.006693,0.017986,0.268941,0.119203,0.002473,0.017986,0.006693


# 2. Label transformed PSSM profiles with DSSP 'truths'

In [106]:
# for each training sequence: read its .dssp file, take the secondary-structure
# string from the first cell and split it into a list of single characters
# ('H', 'E' or '-'), one per residue
dssps = [
    list(pd.read_csv(train_dir + sequence_ID + '.dssp').iloc[0, 0])
    for sequence_ID in sequence_IDs
]

# store the labels as a 'dssp' column of the matching pssm matrix. Labels and
# scores stay in the same dataframe through the sliding window, until the
# samples are split into train and validation subsets
for i, labelled in enumerate(profiles):
    labelled['dssp'] = dssps[i]

# view the first profile as an example labelled PSSM
profiles[0]

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,dssp
R,0.119203,0.993307,0.268941,0.119203,0.017986,0.952574,0.500000,0.047426,0.268941,0.017986,...,0.982014,0.119203,0.017986,0.047426,0.268941,0.119203,0.017986,0.047426,0.047426,-
Q,0.047426,0.268941,0.268941,0.880797,0.006693,0.500000,0.952574,0.047426,0.999665,0.006693,...,0.731059,0.047426,0.047426,0.047426,0.119203,0.047426,0.017986,0.268941,0.017986,-
F,0.017986,0.006693,0.006693,0.002473,0.017986,0.006693,0.006693,0.006693,0.047426,0.119203,...,0.006693,0.119203,0.999665,0.002473,0.017986,0.017986,0.999877,0.731059,0.047426,-
G,0.268941,0.017986,0.119203,0.047426,0.017986,0.017986,0.017986,0.999089,0.017986,0.002473,...,0.047426,0.006693,0.006693,0.017986,0.119203,0.047426,0.017986,0.006693,0.006693,-
H,0.268941,0.268941,0.993307,0.982014,0.006693,0.500000,0.731059,0.268941,0.993307,0.006693,...,0.119203,0.017986,0.006693,0.047426,0.268941,0.268941,0.006693,0.047426,0.017986,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K,0.047426,0.268941,0.993307,0.119203,0.006693,0.268941,0.119203,0.731059,0.731059,0.047426,...,0.982014,0.880797,0.017986,0.017986,0.500000,0.119203,0.017986,0.731059,0.017986,H
R,0.119203,0.997527,0.119203,0.047426,0.017986,0.268941,0.119203,0.119203,0.119203,0.006693,...,0.880797,0.047426,0.006693,0.017986,0.952574,0.500000,0.006693,0.017986,0.017986,-
K,0.047426,0.731059,0.119203,0.500000,0.006693,0.500000,0.500000,0.017986,0.047426,0.006693,...,0.999089,0.047426,0.006693,0.047426,0.119203,0.047426,0.006693,0.017986,0.017986,-
N,0.047426,0.119203,0.999665,0.952574,0.006693,0.119203,0.268941,0.119203,0.268941,0.006693,...,0.268941,0.017986,0.006693,0.017986,0.268941,0.119203,0.002473,0.017986,0.006693,-


In [107]:
# we want to do the same on test samples so writing a function would be easier

def attach_dssp_info(list_of_profiles, list_of_dssps):
    """Return copies of the PSSM profiles with a per-residue 'dssp' label column.

    Args:
        list_of_profiles: list of DataFrames, one row per residue.
        list_of_dssps: list of DataFrames aligned with ``list_of_profiles``.
            The first cell (``iloc[0, 0]``) of each must hold the
            secondary-structure string, one character per residue.

    Returns:
        list: new DataFrames carrying the extra 'dssp' column. The input
        DataFrames are left unmodified.

    Raises:
        ValueError: if the two lists differ in length, or if a DSSP string
            does not have exactly one character per profile row.
    """
    if len(list_of_profiles) != len(list_of_dssps):
        raise ValueError(
            'got %d profiles but %d dssp entries'
            % (len(list_of_profiles), len(list_of_dssps))
        )

    labelled_profiles = []
    for position, (profile, dssp_frame) in enumerate(zip(list_of_profiles, list_of_dssps)):
        # extract the string and split it into one character per residue
        states = list(dssp_frame.iloc[0, 0])
        if len(states) != len(profile):
            raise ValueError(
                'profile %d has %d residues but its dssp string has %d characters'
                % (position, len(profile), len(states))
            )
        # list.copy() would only copy the list, not the DataFrames in it, so
        # copy each DataFrame explicitly to keep the caller's data untouched
        labelled = profile.copy()
        labelled['dssp'] = states
        labelled_profiles.append(labelled)
    return labelled_profiles


# smoke test for attach_dssp_info: strip the labels from the training profiles,
# re-attach them through the function and inspect the first result
dssps_raw = [pd.read_csv(train_dir + sequence_ID + '.dssp') for sequence_ID in sequence_IDs]
profiles_2 = attach_dssp_info([pf.drop(columns='dssp') for pf in profiles], dssps_raw)
profiles_2[0]

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,dssp
R,0.119203,0.993307,0.268941,0.119203,0.017986,0.952574,0.500000,0.047426,0.268941,0.017986,...,0.982014,0.119203,0.017986,0.047426,0.268941,0.119203,0.017986,0.047426,0.047426,-
Q,0.047426,0.268941,0.268941,0.880797,0.006693,0.500000,0.952574,0.047426,0.999665,0.006693,...,0.731059,0.047426,0.047426,0.047426,0.119203,0.047426,0.017986,0.268941,0.017986,-
F,0.017986,0.006693,0.006693,0.002473,0.017986,0.006693,0.006693,0.006693,0.047426,0.119203,...,0.006693,0.119203,0.999665,0.002473,0.017986,0.017986,0.999877,0.731059,0.047426,-
G,0.268941,0.017986,0.119203,0.047426,0.017986,0.017986,0.017986,0.999089,0.017986,0.002473,...,0.047426,0.006693,0.006693,0.017986,0.119203,0.047426,0.017986,0.006693,0.006693,-
H,0.268941,0.268941,0.993307,0.982014,0.006693,0.500000,0.731059,0.268941,0.993307,0.006693,...,0.119203,0.017986,0.006693,0.047426,0.268941,0.268941,0.006693,0.047426,0.017986,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K,0.047426,0.268941,0.993307,0.119203,0.006693,0.268941,0.119203,0.731059,0.731059,0.047426,...,0.982014,0.880797,0.017986,0.017986,0.500000,0.119203,0.017986,0.731059,0.017986,H
R,0.119203,0.997527,0.119203,0.047426,0.017986,0.268941,0.119203,0.119203,0.119203,0.006693,...,0.880797,0.047426,0.006693,0.017986,0.952574,0.500000,0.006693,0.017986,0.017986,-
K,0.047426,0.731059,0.119203,0.500000,0.006693,0.500000,0.500000,0.017986,0.047426,0.006693,...,0.999089,0.047426,0.006693,0.047426,0.119203,0.047426,0.006693,0.017986,0.017986,-
N,0.047426,0.119203,0.999665,0.952574,0.006693,0.119203,0.268941,0.119203,0.268941,0.006693,...,0.268941,0.017986,0.006693,0.017986,0.268941,0.119203,0.002473,0.017986,0.006693,-


## Repeat the steps on test samples

In [110]:
# load profiles and dssp information on test samples

# get names of all files in test subset folder
file_names_test = os.listdir(test_dir)

# keep only files ending in '.profile' and derive their sequence IDs.
# The test IDs get their own variable so that the training-set sequence_IDs
# list is not overwritten (re-running an earlier cell would otherwise pair
# training profiles with test IDs).
profile_names_test = [f for f in file_names_test if f.endswith('.profile')]
sequence_IDs_test = [pn[:-8] for pn in profile_names_test]

# read in pssm profiles of test dataset
profiles_test = [pd.read_csv(test_dir + pn) for pn in profile_names_test]

# read in dssp profiles of test dataset
dssps_test = [pd.read_csv(test_dir + sequence_ID + '.dssp') for sequence_ID in sequence_IDs_test]

# tidy up pssm profiles using the process_profile() function defined above
profiles_test = [process_profile(pf) for pf in profiles_test]

# apply logistic function to the PSSM scores
profiles_test = [profile.applymap(logistic_func) for profile in profiles_test]

# add dssp information into pssm profiles, using the attach_dssp_info() function defined above
profiles_test = attach_dssp_info(profiles_test, dssps_test)

# view the first test profile
profiles_test[0]


Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,dssp
Q,0.119203,0.731059,0.268941,0.268941,0.017986,0.993307,0.880797,0.047426,0.268941,0.017986,...,0.982014,0.119203,0.017986,0.119203,0.268941,0.119203,0.017986,0.047426,0.047426,-
K,0.119203,0.731059,0.880797,0.268941,0.017986,0.952574,0.500000,0.119203,0.268941,0.017986,...,0.997527,0.119203,0.017986,0.119203,0.268941,0.119203,0.017986,0.047426,0.047426,-
D,0.119203,0.500000,0.500000,0.993307,0.006693,0.952574,0.880797,0.047426,0.119203,0.017986,...,0.982014,0.047426,0.006693,0.119203,0.268941,0.119203,0.006693,0.047426,0.017986,H
L,0.119203,0.047426,0.047426,0.017986,0.047426,0.047426,0.047426,0.017986,0.047426,0.500000,...,0.047426,0.500000,0.982014,0.017986,0.119203,0.952574,0.119203,0.268941,0.268941,H
A,0.993307,0.047426,0.047426,0.047426,0.119203,0.119203,0.119203,0.268941,0.047426,0.500000,...,0.119203,0.119203,0.047426,0.119203,0.880797,0.268941,0.017986,0.047426,0.268941,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P,0.119203,0.017986,0.017986,0.047426,0.017986,0.047426,0.047426,0.017986,0.017986,0.017986,...,0.119203,0.017986,0.006693,0.999665,0.119203,0.119203,0.006693,0.017986,0.268941,-
T,0.119203,0.500000,0.731059,0.119203,0.047426,0.731059,0.119203,0.047426,0.047426,0.268941,...,0.731059,0.119203,0.047426,0.047426,0.268941,0.982014,0.017986,0.047426,0.952574,-
P,0.731059,0.119203,0.047426,0.119203,0.017986,0.119203,0.731059,0.047426,0.047426,0.017986,...,0.500000,0.047426,0.006693,0.999089,0.268941,0.119203,0.006693,0.017986,0.047426,-
P,0.119203,0.047426,0.047426,0.047426,0.017986,0.047426,0.119203,0.047426,0.047426,0.017986,...,0.119203,0.017986,0.006693,0.999665,0.119203,0.119203,0.006693,0.017986,0.017986,-


Write out the DSSP-labelled train and test PSSM profiles as pickle (.pkl) files so we can load them later

In [111]:
# locations of the output files relative to out_dir: sub-directory plus file
# name, including the '.pkl' extension
pssm_train = '/train/pssm_train.pkl' # labelled train subset
pssm_test = '/test/pssm_test.pkl' # labelled test subset

# full output paths, built by plain concatenation. out_dir ends with '/' and
# the names above start with '/', so the result contains a doubled '//'.
pssm_train_path = out_dir + pssm_train
pssm_test_path = out_dir + pssm_test

def save_profiles(list_of_profiles, out_path):
    """Pickle a list of profiles to ``out_path``.

    Missing parent directories of ``out_path`` are created first, so writing
    into a sub-directory that does not exist yet no longer fails.

    Args:
        list_of_profiles: the object to serialise (a list of DataFrames in
            this notebook).
        out_path: destination file path.

    Returns:
        int: always 1, kept so existing callers see the same return value.
    """
    parent_dir = os.path.dirname(out_path)
    if parent_dir:  # empty when out_path is a bare file name
        os.makedirs(parent_dir, exist_ok=True)
    # 'wb' = open for writing in binary mode, which pickle requires
    with open(out_path, 'wb') as f_write:
        pickle.dump(list_of_profiles, f_write, pickle.HIGHEST_PROTOCOL)
    return 1


Test that files have been written properly

In [112]:
# write both labelled sets to disk with save_profiles(); it returns 1, and the
# notebook echoes the return value of the last call as the cell output
save_profiles(profiles, pssm_train_path)
save_profiles(profiles_test, pssm_test_path)

1

In [113]:
# reload the pickled training profiles to check that the file round-trips
with open(pssm_train_path, 'rb') as pkl_in:
    pssm_profiles_train = pickle.load(pkl_in)

pssm_profiles_train[0]

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,dssp
R,0.119203,0.993307,0.268941,0.119203,0.017986,0.952574,0.500000,0.047426,0.268941,0.017986,...,0.982014,0.119203,0.017986,0.047426,0.268941,0.119203,0.017986,0.047426,0.047426,-
Q,0.047426,0.268941,0.268941,0.880797,0.006693,0.500000,0.952574,0.047426,0.999665,0.006693,...,0.731059,0.047426,0.047426,0.047426,0.119203,0.047426,0.017986,0.268941,0.017986,-
F,0.017986,0.006693,0.006693,0.002473,0.017986,0.006693,0.006693,0.006693,0.047426,0.119203,...,0.006693,0.119203,0.999665,0.002473,0.017986,0.017986,0.999877,0.731059,0.047426,-
G,0.268941,0.017986,0.119203,0.047426,0.017986,0.017986,0.017986,0.999089,0.017986,0.002473,...,0.047426,0.006693,0.006693,0.017986,0.119203,0.047426,0.017986,0.006693,0.006693,-
H,0.268941,0.268941,0.993307,0.982014,0.006693,0.500000,0.731059,0.268941,0.993307,0.006693,...,0.119203,0.017986,0.006693,0.047426,0.268941,0.268941,0.006693,0.047426,0.017986,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K,0.047426,0.268941,0.993307,0.119203,0.006693,0.268941,0.119203,0.731059,0.731059,0.047426,...,0.982014,0.880797,0.017986,0.017986,0.500000,0.119203,0.017986,0.731059,0.017986,H
R,0.119203,0.997527,0.119203,0.047426,0.017986,0.268941,0.119203,0.119203,0.119203,0.006693,...,0.880797,0.047426,0.006693,0.017986,0.952574,0.500000,0.006693,0.017986,0.017986,-
K,0.047426,0.731059,0.119203,0.500000,0.006693,0.500000,0.500000,0.017986,0.047426,0.006693,...,0.999089,0.047426,0.006693,0.047426,0.119203,0.047426,0.006693,0.017986,0.017986,-
N,0.047426,0.119203,0.999665,0.952574,0.006693,0.119203,0.268941,0.119203,0.268941,0.006693,...,0.268941,0.017986,0.006693,0.017986,0.268941,0.119203,0.002473,0.017986,0.006693,-


In [114]:
# reload the pickled test profiles to check that the file round-trips
with open(pssm_test_path, 'rb') as pkl_in:
    pssm_profiles_test = pickle.load(pkl_in)

pssm_profiles_test[0]

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,dssp
Q,0.119203,0.731059,0.268941,0.268941,0.017986,0.993307,0.880797,0.047426,0.268941,0.017986,...,0.982014,0.119203,0.017986,0.119203,0.268941,0.119203,0.017986,0.047426,0.047426,-
K,0.119203,0.731059,0.880797,0.268941,0.017986,0.952574,0.500000,0.119203,0.268941,0.017986,...,0.997527,0.119203,0.017986,0.119203,0.268941,0.119203,0.017986,0.047426,0.047426,-
D,0.119203,0.500000,0.500000,0.993307,0.006693,0.952574,0.880797,0.047426,0.119203,0.017986,...,0.982014,0.047426,0.006693,0.119203,0.268941,0.119203,0.006693,0.047426,0.017986,H
L,0.119203,0.047426,0.047426,0.017986,0.047426,0.047426,0.047426,0.017986,0.047426,0.500000,...,0.047426,0.500000,0.982014,0.017986,0.119203,0.952574,0.119203,0.268941,0.268941,H
A,0.993307,0.047426,0.047426,0.047426,0.119203,0.119203,0.119203,0.268941,0.047426,0.500000,...,0.119203,0.119203,0.047426,0.119203,0.880797,0.268941,0.017986,0.047426,0.268941,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P,0.119203,0.017986,0.017986,0.047426,0.017986,0.047426,0.047426,0.017986,0.017986,0.017986,...,0.119203,0.017986,0.006693,0.999665,0.119203,0.119203,0.006693,0.017986,0.268941,-
T,0.119203,0.500000,0.731059,0.119203,0.047426,0.731059,0.119203,0.047426,0.047426,0.268941,...,0.731059,0.119203,0.047426,0.047426,0.268941,0.982014,0.017986,0.047426,0.952574,-
P,0.731059,0.119203,0.047426,0.119203,0.017986,0.119203,0.731059,0.047426,0.047426,0.017986,...,0.500000,0.047426,0.006693,0.999089,0.268941,0.119203,0.006693,0.017986,0.047426,-
P,0.119203,0.047426,0.047426,0.047426,0.017986,0.047426,0.119203,0.047426,0.047426,0.017986,...,0.119203,0.017986,0.006693,0.999665,0.119203,0.119203,0.006693,0.017986,0.017986,-


## See rest of the pipeline in PSSM_train.ipynb, which will resume from loading in these .pkl files (to save time having to re-process the dataframes each time the notebook restarts)