# Read, split, and process training data for cross-validation

The purpose of this notebook is to parse training profiles as seven separate subsets for cross-validation

Import core modules

In [1]:
import pandas as pd
import numpy as np
import math
import os
import pickle

The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.


Specify directory of dataset for training and testing
Assumes that current working directory is '/homes/2472402/'

In [21]:
root_dir = '/cluster/gjb_lab/2472402/retr231/training/'
train_dirs = [train_dir + str(i) + '/' for i in range(1,8)]

# nested list of file names with .fasta extension
file_names = [os.listdir(train_dir) for train_dir in train_dirs]

# remove .fasta extension to leave us with just the sequence IDs
# do this with nested lsit comprehension
seq_names = [[f[:-6] for f in file_name] for file_name in file_names]

Check that across the subsets, all names are unique, there are no repetitions

In [60]:
# step 1 is to pool all sequences into a single list
seq_names_pooled = [seq for sublist in seq_names for seq in sublist]

# convert to series
seq_names_pooled_series = pd.Series(seq_names_pooled)

# check that number of unique elements is equal to total number of elements
# if true then pass
seq_names_pooled_series.nunique() == len(seq_names_pooled)

True

Convert the seq names to a dictionary

In [83]:
cross_validation_subsets = {k:v for k,v in zip(list(range(1,8)), seq_names)}

# save dictionary
with open('cross_val_dict.pkl', 'wb') as fp:
    pickle.dump(cross_validation_subsets, fp, protocol=pickle.HIGHEST_PROTOCOL)

Load in .profile files and split them based on cross validation dictionary

In [90]:
# create nested list of .profile file names to read
profile_names = [[seq + '.profile' for seq in sublist] for sublist in seq_names ]

profile_dir = '/cluster/gjb_lab/2472402/retr231/training/' # path to training dataset

# create nested list of profiles, stored as dataframes
# profiles_list = pd.read_csv()


# Parse profiles as nested list

Use the code from PSSM_parse.ipynb to parse in profiles in nested list, rather than a simple list

In [93]:
# read all profiles (files with the .profile extension), collate into a list
profiles_list = [[pd.read_csv(profile_dir + pn) for pn in sublist] for sublist in profile_names]

# might want to run this in future if want to keep track of sequence_ID
# profiles = [pn[:-8]: pd.read_csv(train_dir + pn) for pn in profile_names]

# store column names of PSSM matrix as a constant
column_names = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']

def logistic_func(x):
    x = int(x) # cast string to int
    return 1/(1+math.exp(-1*x)) # simple case of logistic function

def process_profile(pf):
    # for each list element, take only the pssm column
    pf = pf['Last position-specific scoring matrix computed'] 
    # for each list element, remove the last 5 rows from each pssm profile, i.e. k and lambda metrics
    pf = pf[:-5]
    # for each list element, split each row using default whitespace delimiter. take only elements 0 - 20, i.e. the 20 amino acid residues
    pf = pd.DataFrame([row.split() for row in pf])
    # obtain sequence in list form - this will become the row names
    index = pf.iloc[1:,1].tolist()
    # trim away unneeded portion of dataframe, including numerical index, and cols 23 onwards
    pf = pf.iloc[1:, 2:22]
    # assign column names (the same for all PSSM profiles)
    pf.columns = column_names
    # assign row names of pf
    pf.index = index
    return pf

# perform these steps at one go, one profile at a time
profiles_list = [[process_profile(pf) for pf in sublist] for sublist in profiles_list]

Squash using logistic regression

In [97]:
pf_list = [[pf.applymap(logistic_func) for pf in sublist] for sublist in profiles_list]

Label with dssp truths

In [160]:
# load DSSP information for that sequence
dssps = [[pd.read_csv(profile_dir + sequence_ID + '.dssp') for sequence_ID in sublist] for sublist in seq_names]

# extract out the string from the dataframe
dssps = [[dssp.iloc[0,0] for dssp in sublist] for sublist in dssps]

# convert string into list of 'H', 'E' or '-' characters, 1 for each residue
dssps = [[[char for char in dssp] for dssp in sublist] for sublist in dssps]

def add_dssp(profile, dssp):
    profile['dssp'] = dssp
    return profile

labelled_pfs = [[add_dssp(pf, dssp) for (pf, dssp) in zip(pf_list_sub, dssps_sub)] for (pf_list_sub, dssps_sub) in zip(pf_list, dssps)]

#see an example
labelled_pfs[4][192]

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,dssp
M,0.268941,0.268941,0.119203,0.047426,0.119203,0.500000,0.119203,0.047426,0.119203,0.731059,...,0.268941,0.997527,0.500000,0.047426,0.119203,0.268941,0.119203,0.268941,0.731059,-
P,0.268941,0.047426,0.047426,0.119203,0.017986,0.119203,0.119203,0.047426,0.047426,0.047426,...,0.268941,0.047426,0.017986,0.999877,0.268941,0.119203,0.006693,0.017986,0.047426,-
L,0.268941,0.047426,0.017986,0.006693,0.119203,0.047426,0.017986,0.006693,0.017986,0.880797,...,0.047426,0.731059,0.268941,0.017986,0.047426,0.268941,0.047426,0.119203,0.993307,-
P,0.119203,0.047426,0.047426,0.047426,0.017986,0.119203,0.119203,0.047426,0.047426,0.017986,...,0.119203,0.047426,0.006693,0.999665,0.268941,0.880797,0.006693,0.017986,0.047426,-
A,0.993307,0.047426,0.119203,0.731059,0.119203,0.119203,0.119203,0.119203,0.047426,0.268941,...,0.119203,0.119203,0.017986,0.047426,0.731059,0.731059,0.006693,0.047426,0.952574,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D,0.017986,0.047426,0.119203,0.997527,0.002473,0.268941,0.997527,0.017986,0.047426,0.002473,...,0.119203,0.006693,0.002473,0.017986,0.119203,0.047426,0.002473,0.006693,0.006693,H
R,0.047426,0.999089,0.119203,0.017986,0.002473,0.952574,0.119203,0.017986,0.500000,0.002473,...,0.952574,0.017986,0.006693,0.017986,0.268941,0.047426,0.006693,0.017986,0.006693,-
G,0.952574,0.047426,0.119203,0.047426,0.047426,0.047426,0.047426,0.993307,0.047426,0.731059,...,0.047426,0.047426,0.017986,0.047426,0.268941,0.119203,0.017986,0.017986,0.119203,-
A,0.993307,0.119203,0.119203,0.047426,0.268941,0.119203,0.119203,0.268941,0.047426,0.268941,...,0.119203,0.268941,0.047426,0.119203,0.731059,0.268941,0.017986,0.047426,0.880797,-


Save as pickle object (in case we want to use it for 2D CNN)

In [185]:
with open('/homes/2472402/data/pssm-sdx.pkl', 'wb') as fp:
    pickle.dump(labelled_pfs, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
# s = squashed, d = labelled with dssp information, x = split into groups for cross-validation, w = windowed

## Sliding window operation

Define some functions for the operation

In [163]:
# function to add padding of #FLANK rows to the top and bottom of dataframe. flank = 8 for layer 1 (seq to struct) and flank = 9 for layer 2 (struct to struct)
def padding_operation(profile, flank):
    # create a numpy array with 21 NaNs
    pad_np_array = np.hstack((np.repeat(np.nan,21)))
    # convert numpy array into pd Series so we can create dataframe based on it
    pad_row = pd.Series(pad_np_array)
    # create empty dataframe and add the NaN rows to it
    pad_df = pd.DataFrame()
    for _ in range(flank):
        pad_df = pad_df.append(pad_row, ignore_index=True)
    # assign column names to this padding df - same column names as ppsm matrix
    pad_df.columns = profile.columns
    # create a copy of profile which i will now modify
    pf = profile.copy()
    # reset index names from pf, otherwise pd.concat (next line) will throw error saying index has to be nonredundant
    pf.index = range(0,len(pf.index))
    # add padding to above and below dataframes
    padded_pf = pd.concat([pad_df, pf, pad_df], ignore_index=True)
    return padded_pf

# function returns a list of windows (variable name patterns). flank argument is same as above. takes in the padded_df from padding_operation()
def sliding_operation(padded_pf, flank):
    # Using iloc based indexing. i.e. 'location based indexing' cf label based indexing
    # idx of first amino acid of sequence in padded dataframe 
    seqStartIdx = flank 
    # idx of last amino acid of sequence in padded dataframe
    seqEndIdx = padded_pf.shape[0]-flank
    # create a list of dataframes which we will put individual windows into. this list of patterns should span across samples
    patterns = []
    for seqIdx in range(seqStartIdx, seqEndIdx): # for each amino acid sequence / centroid of window
        # define start and end index of window given position of center residue
        winStartIdx = seqIdx - 8
        winEndIdx = seqIdx + 8 + 1
        # again using location based indexing with .iloc[]
        pattern = padded_pf.iloc[winStartIdx:winEndIdx, :]
        patterns.append(pattern)
    return patterns

# Wrapper function that generates N patterns from an unpadded profile
def get_patterns(profile):
    # add padding
    padded_pf = padding_operation(profile, flank = 8)
    # genereate windows
    patterns = sliding_operation(padded_pf, flank = 8)
    return(patterns)

Generate patterns for each profile in each training subset, then collapse all the patterns in each training subset

In [172]:
# get a doubly nested list consisting of 7 elements, each element containing 193 subelements that represent individual profiles.  
# these subelements are themselves lists, each containing N profiles where N is sequence length/number of rows in that profile
pats_nested = [[get_patterns(pf) for pf in pfs] for pfs in labelled_pfs]
pats_collapsed = [[pat for pats_sub in pats for pat in pats_sub] for pats in pats_nested]

Save patterns as pkl object

In [187]:
with open('/homes/2472402/data/pssm-sdxw.pkl', 'wb') as fp:
    pickle.dump(pats_collapsed, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
# s = squashed, d = labelled with dssp information, x = split into groups for cross-validation, w = windowed

# Linearize patterns

Define functions to linearize

In [189]:
# function to collapse rows in df into a single row dataframe. 
# this is a inner function, it should not be applied on patterns containing dssp information
def _linearize(df):
    ndf = df.T.unstack().to_frame().T
    ndf.columns = ndf.columns.swaplevel()
    return ndf

# wrapper function of _linearize
# apply this on patterns containing dssp information
# it will return a single-row df with dssp information as the last column
def linearize(labelled_pat):
    dssp_col = labelled_pat.iloc[:,-1].tolist() # store dssp column as a list
    centroid_idx = int(len(dssp_col)/2) # index of centre of window
    target = dssp_col[centroid_idx] # store prediction target. will be H, E, or -
    unlabelled_pat = labelled_pat.iloc[:, :-1] # remove dssp column
    unlabelled_pat.reset_index(drop=True, inplace = True) # reset index so that column name will be same across all patterns
    linearized_pat = _linearize(unlabelled_pat) # convert unlabelled pattern into a single-row df
    linearized_pat['dssp'] = target # assign additional column containing prediction target to the single row df
    return linearized_pat

In [190]:
linear_dfs = [[linearize(pat) for pat in pats] for pats in pats_collapsed]

In [191]:
[len(x) for x in linear_dfs]

[29447, 32902, 29968, 30528, 33242, 33184, 29150]

In [192]:
linear_dfs_concat = [pd.concat(dfs, ignore_index=True) for dfs in linear_dfs]

Save patterns as pkl object

In [196]:
# s = squashed, d = labelled with dssp information, x = split into groups for cross-validation, w = windowed, l = linearized
with open('/homes/2472402/data/pssm-sdxwl.pkl', 'wb') as fp:
    pickle.dump(linear_dfs_concat, fp, protocol=pickle.HIGHEST_PROTOCOL)

# Do one-hot encoding of DSSP information

In [74]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [75]:
with open('/homes/2472402/data/pssm-sdxwl.pkl', 'rb') as fp: # list of 7 giant dataframes
    Xs = pickle.load(fp)
ys = [X.dssp for X in Xs] # obtain 'dssp' column from each of the 7 dfs
Xs = [X.drop(columns='dssp') for X in Xs] # drop 'dssp' column from each df

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [76]:
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
ys = [pd.DataFrame(enc.fit_transform(y.values.reshape(-1,1))) for y in ys]

Deal with NaNs in edge cases by replacing them with 0

In [77]:
imp = SimpleImputer(missing_values = np.nan, strategy = 'constant', fill_value = 0) # NaN with 0, a number that virtually does not occur in the squashed matrix
Xs = [pd.DataFrame(imp.fit_transform(X)) for X in Xs]

# Save imputed data

In [196]:
# s = squashed, d = labelled with dssp information, x = split into groups for cross-validation, w = windowed, l = linearized, i = imputed
with open('/homes/2472402/data/pssm-sdxwli.pkl', 'wb') as fp:
    pickle.dump([Xs,ys], fp, protocol=pickle.HIGHEST_PROTOCOL)

# Do cross validation on split sets

In [2]:
# write function to generate cross validation indices
# custom implementation of stratifiedkfold from sklearn.model_selection
# by altering the upper bound for
def train_test_split(folds):
    d_list = [] # list of dictionaries to return at end of func
    for test_idx in range(0,folds):
        all_idx = list(range(0,folds))
        train_idx = [idx for idx in all_idx if idx != test_idx]
        d = {'test_idx': test_idx, 'train_idx' : train_idx} # d is a dictionary of test and train indices
        d_list.append(d)
    return d_list 


# this function is copied from HMM.ipynb
# array: numpy array
# flank: positive integer
def sliding_window(array, flank):
    assert flank > 0
    assert type(array) is np.ndarray
    assert np.logical_not(np.isnan(np.sum(array)))
    nrow = array.shape[0]
    assert nrow > 0
    ncol = array.shape[1]
    assert ncol > 0
    res = np.empty(shape=(nrow, (2*flank+1)*ncol))
    res[:] = np.nan
    for i in list(range(0,nrow)):
        s, e = i-flank, i+flank+1
        k = 0;
        for j in list(range(s,e)):
            if (j < 0 or j >= nrow):
                res[i, k:k+ncol] = 0
            else:
                assert np.logical_not(np.isnan(np.sum(array[j])))
                assert array[j].shape == (ncol,)
                res[i, k:k+ncol] = array[j]
            k += ncol
    assert np.logical_not(np.isnan(np.sum(res)))
    assert res.shape == (nrow, (2*flank+1)*ncol)
    return res

# this function rounds predictions into 1 and 0s
def argmax(arr):
    n, c = arr.shape
    assert c == 3
    assert type(arr) is np.ndarray
    assert np.logical_not(np.isnan(np.sum(arr)))
    res = np.empty(shape=(n,c))
    res[:] = np.nan
    for i in list(range(0,n)):
        max_idx = np.argmax(arr[i])
        if max_idx == 0:
            res[i] = np.array([1, 0, 0])
        elif max_idx == 1:
            res[i] = np.array([0, 1, 0])
        else:
            assert max_idx == 2
            res[i] = np.array([0, 0, 1])
    assert np.logical_not(np.isnan(np.sum(res)))
    return res

In [3]:
# import modules
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import pickle
from datetime import datetime
from os import path

# first make sure that output path is valid, otherwise computation will go to waste
out_path = '/cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/'
assert path.exists(out_path)
assert out_path[-1] == '/'

# import training data
X = pickle.load(open('/cluster/gjb_lab/2472402/data/cross-val/pssm-sdxwli-seqs-X.pkl','rb'))
Y = pickle.load(open('/cluster/gjb_lab/2472402/data/cross-val/pssm-sdxwli-seqs-y.pkl','rb'))

# comment out these lines after dry run. 
X = [x[0:2] for x in X]
Y = [y[0:2] for y in Y]

# counter for cross validation
counter = 1

# for each fold in the 7 fold cross validation procedure, do...
for current_split in train_test_split(folds=7): 
    
    print('Commencing fold %d of cross validation at %s'%(counter, datetime.now().strftime("%D %H:%M:%S")))
    
    # get which set will be used for validation (test_idx), the remaining 6 will be usede for training (train_idx)
    train_idx = current_split['train_idx']
    test_idx = current_split['test_idx']
    
    # obtain test pssm profile (X) and dssp information (y)
    X_test = np.vstack(X[test_idx])
    Y_test = np.vstack(Y[test_idx])

    assert X_test.dtype=='float64'
    assert Y_test.dtype=='float64'
    
    assert X_test.shape[0] == Y_test.shape[0]
    assert X_test.shape[1] == 340
    assert Y_test.shape[1] == 3

    # obtain train pssm profile (X) and dssp information (Y)
    X_train = np.concatenate(tuple([X[idx] for idx in train_idx]), dtype=object)
    Y_train = np.concatenate(tuple([Y[idx] for idx in train_idx]), dtype=object)
    
    X_train_stacked = np.concatenate(X_train) # dtype=float64
    Y_train_stacked = np.concatenate(Y_train)
    
    assert X_train_stacked.dtype=='float64'
    assert Y_train_stacked.dtype=='float64'
    
    assert X_train_stacked.shape[0] == Y_train_stacked.shape[0]
    assert X_train_stacked.shape[1] == 340
    assert Y_train_stacked.shape[1] == 3
    
    # sequence to structure layer
    model1 = keras.Sequential([
        layers.Dense(units=20, activation='sigmoid', input_shape=[340]),
        layers.Dense(units=3, activation='softmax')
    ])

    model1.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

    print('Fitting layer 1 model. %s' % datetime.now().strftime("%H:%M:%S"))
    
    history1 = model1.fit(X_train_stacked, Y_train_stacked,
                          validation_data=(X_test,Y_test),
                          batch_size=128,
                          epochs=300, 
                          verbose=0)
    
    print('Calculating layer 1 predictions. %s' % datetime.now().strftime("%H:%M:%S"))
    
    # obtain layer 1 predictions and simplify it with argmax
    Y_pred = [argmax(model1.predict(X)) for X in X_train]
    
    # convert layer 1 Y output into layer 2 X input
    X_train_2 = [sliding_window(Y, flank=9) for Y in Y_pred]
    X_train_2_stacked = np.concatenate(tuple(X_train_2))
    
    # structure to structure layer
    model2 = keras.Sequential([
        layers.Dense(units=20, activation='sigmoid', input_shape=[57]), 
        layers.Dense(units=3, activation = 'softmax')
    ])
    
    model2.compile(
        optimizer='sgd', 
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    # feed X_test through layer 1 because we are testing it on layer 2
    X_test = model1.predict(X_test)
    X_test = sliding_window(X_test, flank=9)
    
    print('Fitting layer 2 model. %s' % datetime.now().strftime("%H:%M:%S"))

    history2 = model2.fit(
        X_train_2_stacked, Y_train_stacked, # note y_train is unchanged
        validation_data=(X_test, Y_test), # this time include test data
        batch_size=128, 
        epochs=300, 
        verbose=0, 
    )
    
    # save results 
    print('Saving results to %s' % out_path)

    history = [pd.DataFrame(history1.history), pd.DataFrame(history2.history)]
    pickle.dump(history, open(out_path + 'results_%d.pkl' % counter, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

    model1.save(out_path + 'fold%d_model1' % counter, save_format = 'tf') # tensorflow SavedModel format
    model2.save(out_path + 'fold%d_model2' % counter, save_format = 'tf')

    # finish current fold
    print('Finished fold %d of cross validation at %s\n' % (counter, datetime.now().strftime("%D %H:%M:%S")))
    
    # increment counter and continue
    counter += 1

Commencing fold 1 of cross validation at 08/06/21 14:31:49
Fitting layer 1 model. 14:31:49




Calculating layer 1 predictions. 14:32:38
Fitting layer 2 model. 14:32:41
Saving results to /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold1_model1/assets
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold1_model2/assets
Finished fold 1 of cross validation at 08/06/21 14:33:30

Commencing fold 2 of cross validation at 08/06/21 14:33:30
Fitting layer 1 model. 14:33:30




Calculating layer 1 predictions. 14:34:16
Fitting layer 2 model. 14:34:19
Saving results to /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold2_model1/assets
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold2_model2/assets
Finished fold 2 of cross validation at 08/06/21 14:35:03

Commencing fold 3 of cross validation at 08/06/21 14:35:03
Fitting layer 1 model. 14:35:03




Calculating layer 1 predictions. 14:35:50
Fitting layer 2 model. 14:35:52
Saving results to /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold3_model1/assets
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold3_model2/assets
Finished fold 3 of cross validation at 08/06/21 14:36:41

Commencing fold 4 of cross validation at 08/06/21 14:36:41
Fitting layer 1 model. 14:36:41




Calculating layer 1 predictions. 14:37:29
Fitting layer 2 model. 14:37:32
Saving results to /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold4_model1/assets
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold4_model2/assets
Finished fold 4 of cross validation at 08/06/21 14:38:20

Commencing fold 5 of cross validation at 08/06/21 14:38:20
Fitting layer 1 model. 14:38:20




Calculating layer 1 predictions. 14:39:07
Fitting layer 2 model. 14:39:10
Saving results to /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold5_model1/assets
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold5_model2/assets
Finished fold 5 of cross validation at 08/06/21 14:39:59

Commencing fold 6 of cross validation at 08/06/21 14:39:59
Fitting layer 1 model. 14:39:59




Calculating layer 1 predictions. 14:40:47
Fitting layer 2 model. 14:40:50
Saving results to /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold6_model1/assets
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold6_model2/assets
Finished fold 6 of cross validation at 08/06/21 14:41:39

Commencing fold 7 of cross validation at 08/06/21 14:41:39
Fitting layer 1 model. 14:41:39




Calculating layer 1 predictions. 14:42:27
Fitting layer 2 model. 14:42:30
Saving results to /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold7_model1/assets
INFO:tensorflow:Assets written to: /cluster/gjb_lab/2472402/outputs/pssm_cross_val/tmp/fold7_model2/assets
Finished fold 7 of cross validation at 08/06/21 14:43:17



In [117]:
acc = [df.accuracy for df in results]
print("%.2f%% (+/- %.2f%%)" % (np.mean(acc), np.std(acc)))

0.80% (+/- 0.12%)
