# Cross validation from serialized dataset

> Faster than HMM_cross_val.py and PSSM_cross_val.py because it does not require preprocessing functions

> This notebook uses scikit-learn (`sklearn`) for feature scaling and for the neural-network classifier.

Qsub parameters

In [1]:
#$ -pe smp 16
#$ -jc short
#$ -adds l_hard gpu 4
#$ -mods l_hard mfree 16G
#$ -m ea
#$ -M 2472402@dundee.ac.uk
#$ -wd /cluster/gjb_lab/2472402/outputs/2021-09-03

Import modules

In [123]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn import neural_network
from sklearn.preprocessing import StandardScaler

Load serialized datasets
> Serialized datasets are obtained from SNNS hmm1.pat and pssm1.pat pattern files

> These pattern files are in turn obtained from running train_network.pl (commenting out the last line train_net())

In [24]:
# Directory holding the serialized inputs. The trailing slash is required
# because the file names below are appended to it directly.
DATA = '/cluster/gjb_lab/2472402/data/retr231/'

# Pickled inputs: two feature sets (HMM, PSSM) and the DSSP vector file that
# is loaded as `y` further down.
HMM = f'{DATA}hmm1.pkl'
PSSM = f'{DATA}pssm1.pkl'
DSSP = f'{DATA}dssp-vec.pkl'

In [75]:
# NOTE: pickle.load can execute arbitrary code while deserializing, so these
# files must come from a trusted source.
# X1 / X2 are used below through .loc and .index, so the pickles are
# presumably pandas DataFrames -- TODO confirm.
with open(HMM, 'rb') as hmm_handle:
    X1 = pickle.load(hmm_handle)

with open(PSSM, 'rb') as pssm_handle:
    X2 = pickle.load(pssm_handle)

# The DSSP pickle is wrapped in a DataFrame so it can be filtered by seqID
# in the same way as the feature tables.
with open(DSSP, 'rb') as dssp_handle:
    y = pd.DataFrame(pickle.load(dssp_handle))

Add seqID column for grouping by seqID during train-test split

In [79]:
# Each row label is split on '_' and the first piece is stored as that row's
# seqID, so rows can later be selected per sequence when building the folds.
for frame in (X1, X2, y):
    frame.loc[:, 'seqID'] = [row_label.split('_')[0] for row_label in frame.index]

Function to load the cross-validation splits from a resume.log file, which is an output of scripts/best_shuffle_by_cutoff.pl

In [41]:
# Parse the cross-validation sets out of a shuffle log file (resume.log style).
# NOTE(review): the surrounding notes name both shuffle.pl and
# scripts/best_shuffle_by_cutoff.pl as the producer of this file -- confirm.
def get_splits(resume_log_file, expected_total=1348):
    """Read the cross-validation sets of sequence IDs from a shuffle log.

    The file is read line by line. A line starting with '#SET' begins a new
    set; every other non-blank line is treated as a path whose last
    '/'-separated component, with '.pssm' removed, is the sequence ID.

    Parameters
    ----------
    resume_log_file : str
        Path of the log file to parse.
    expected_total : int or None, optional
        Total number of sequence IDs expected across all sets. Defaults to
        1348, the value this notebook was written for. Pass None to skip the
        check when parsing a different dataset.

    Returns
    -------
    list of set of str
        One set of sequence IDs per '#SET' section, in file order.

    Raises
    ------
    AssertionError
        If `expected_total` is not None and the sizes of the sets do not sum
        to it.
    """
    val_splits = []
    cur_set = set()
    seen_header = False

    with open(resume_log_file, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Blank lines used to be stored as an empty-string seqID, which
            # polluted the set and inflated the total count.
            if not line:
                continue
            if line.startswith('#SET'):
                # The first header only opens the first set; later headers
                # close the set collected so far and start a fresh one.
                if seen_header:
                    val_splits.append(cur_set)
                    cur_set = set()
                seen_header = True
            else:
                seqID = line.split('/')[-1].replace('.pssm', '')
                cur_set.add(seqID)

    # The last set is not followed by another '#SET' line, so flush it here.
    val_splits.append(cur_set)

    if expected_total is not None:
        total = sum(len(s) for s in val_splits)
        assert total == expected_total, (
            'expected %d sequence IDs in %s, found %d'
            % (expected_total, resume_log_file, total)
        )
    return val_splits

Define cross validation routine

In [127]:
def run_CV(X, y, *,
           splits_file='/cluster/gjb_lab/2472402/data/retr231_shuffles/shuffle02/best_shuffle_th_1.log',
           **params):
    """Train and score one MLPClassifier per predefined cross-validation fold.

    The folds are the sets of sequence IDs returned by
    ``get_splits(splits_file)``. For fold k, rows whose 'seqID' is in set k
    form the validation data and rows whose 'seqID' is in any other set form
    the training data. Features are standardized with a StandardScaler fitted
    on the training rows only, and the validation accuracy of each fold is
    printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table with a 'seqID' column; all other columns are used as
        features.
    y : pandas.DataFrame
        Label table with a 'seqID' column; the labels are read from column 0.
        Rows are assumed to be in the same order as in X -- TODO confirm.
    splits_file : str, keyword-only, optional
        Log file passed to get_splits. Defaults to the shuffle log this
        notebook was written for.
    **params
        Keyword arguments forwarded unchanged to
        sklearn.neural_network.MLPClassifier.

    Returns
    -------
    list
        The fitted classifiers, one per fold, in fold order.

    Notes
    -----
    If a notebook-level variable DEBUG exists and is truthy, every 100th
    training row and every 10th validation row are used, for a quick smoke
    run. An undefined DEBUG is treated as False instead of raising NameError.
    """
    # Read DEBUG at call time so toggling it in the notebook takes effect
    # without redefining this function.
    debug = bool(globals().get('DEBUG', False))

    folds = get_splits(splits_file)

    clfs = []
    for k, valid_ids in enumerate(folds):
        # Training IDs are the union of every fold except the held-out one.
        train_ids = set().union(*(folds[:k] + folds[k + 1:]))

        # Split the data that was passed in (not the notebook-level X1).
        X_train = X[X['seqID'].isin(train_ids)].drop(['seqID'], axis=1)
        X_valid = X[X['seqID'].isin(valid_ids)].drop(['seqID'], axis=1)
        y_train = y[y['seqID'].isin(train_ids)].drop(['seqID'], axis=1)
        y_valid = y[y['seqID'].isin(valid_ids)].drop(['seqID'], axis=1)

        # Fit the scaler on the training fold only, so that no validation
        # statistics leak into training.
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_valid = scaler.transform(X_valid)

        # Labels as 1-D arrays, as expected by the classifier.
        y_train = y_train.loc[:, 0].to_numpy()
        y_valid = y_valid.loc[:, 0].to_numpy()

        if debug:
            X_train = X_train[::100]
            X_valid = X_valid[::10]
            y_train = y_train[::100]
            y_valid = y_valid[::10]

        # Generate and train the classifier for this fold.
        clf = neural_network.MLPClassifier(**params)
        clf.fit(X_train, y_train)

        # Evaluate on the held-out fold.
        print("Accuracy: ", clf.score(X_valid, y_valid))

        clfs.append(clf)

    return clfs

Run training

In [126]:
# Accumulates one list of fitted per-fold classifiers for each experiment below.
clfs_list = list()

In [129]:
# Experiment: SGD solver with Nesterov momentum, constant learning rate and
# logistic activation, run on the HMM features (X1).
# No random_state is passed, so fold accuracies are not exactly reproducible.
params = dict(
    alpha=1e-5,
    solver='sgd',
    learning_rate='constant',
    momentum=0.9,
    nesterovs_momentum=True,
    max_iter=1000,
    shuffle=True,
    activation='logistic',
)
clfs = run_CV(X1, y, **params)
clfs_list.append(clfs)



Accuracy:  0.7094918504314478




Accuracy:  0.6987301587301588




Accuracy:  0.7149805447470817




Accuracy:  0.6952789699570815




Accuracy:  0.6896887159533074




Accuracy:  0.7194132334581773
Accuracy:  0.6923601637107776




In [133]:
# Experiment: same settings as the SGD run, with the solver switched to Adam.
# NOTE: per the scikit-learn MLPClassifier docs, 'learning_rate', 'momentum'
# and 'nesterovs_momentum' are only used when solver='sgd', so they have no
# effect in this run.
params = dict(
    alpha=1e-5,
    solver='adam',
    learning_rate='constant',
    momentum=0.9,
    nesterovs_momentum=True,
    max_iter=1000,
    shuffle=True,
    activation='logistic',
)
clfs = run_CV(X1, y, **params)
clfs_list.append(clfs)

Accuracy:  0.696708213486737
Accuracy:  0.6961904761904761
Accuracy:  0.711413748378729
Accuracy:  0.6928264868179032
Accuracy:  0.6913099870298314
Accuracy:  0.7147315855181023
Accuracy:  0.6988403819918144


In [135]:
# Experiment: Adam with learning_rate='invscaling' and an explicit single
# hidden layer of 100 units.
# NOTE: per the scikit-learn MLPClassifier docs, 'learning_rate' is only used
# when solver='sgd' and (100,) is the default hidden_layer_sizes, so this
# configuration is effectively the same as the previous Adam run; differences
# in accuracy presumably come from random initialization (no random_state).
params = dict(
    alpha=1e-5,
    solver='adam',
    learning_rate='invscaling',
    momentum=0.9,
    nesterovs_momentum=True,
    max_iter=1000,
    shuffle=True,
    activation='logistic',
    hidden_layer_sizes=(100,),
)
clfs = run_CV(X1, y, **params)
clfs_list.append(clfs)

Accuracy:  0.7002237136465325
Accuracy:  0.7012698412698413
Accuracy:  0.711413748378729
Accuracy:  0.6958920907418762
Accuracy:  0.6932555123216602
Accuracy:  0.712234706616729
Accuracy:  0.694406548431105


In [136]:
# Experiment: same Adam configuration as the previous cell, with a stronger
# L2 penalty (alpha raised from 1e-5 to 1e-3).
# NOTE: 'learning_rate', 'momentum' and 'nesterovs_momentum' are only used by
# solver='sgd' (scikit-learn MLPClassifier docs) and have no effect here.
params = dict(
    alpha=1e-3,
    solver='adam',
    learning_rate='invscaling',
    momentum=0.9,
    nesterovs_momentum=True,
    max_iter=1000,
    shuffle=True,
    activation='logistic',
    hidden_layer_sizes=(100,),
)
clfs = run_CV(X1, y, **params)
clfs_list.append(clfs)

Accuracy:  0.6986257590284436
Accuracy:  0.699047619047619
Accuracy:  0.7153047989623865
Accuracy:  0.6888412017167382
Accuracy:  0.6906614785992218
Accuracy:  0.7131710362047441
Accuracy:  0.7066848567530696
