# Get SOV and accuracy predictions from SNNS and keras cross-validation output

Modified version of sov-and-accuracy.ipynb, updated to match the output of wrapper.pl runs from 25 and 26 Aug 2021. Files are now named by seqID instead of by SCOPe domain name.

27 Aug - fixed a bug on the Keras side where H and E were switched. The error was in one of the lambda functions.

# Shared functions

In [2]:
import os
import pickle
import pandas as pd
import numpy as np
import subprocess

In [3]:
# Write function calc() that outputs:
# 1. HMM Q3 accuracy (N matches/N total)
# 2. PSSM Q3 accuracy
# 3. HMM SOV (call calSOV binary from python) 
# 4. PSSM SOV
# 
# Given the following arguments
# 1. sequence
# 2. dssp
# 3. hmm prediction
# 4. pssm prediction

def calc(outfolder,scopeID,seq,dssp,hmm_pred,pssm_pred):
    """Score HMM and PSSM predictions against DSSP with the external calSOV tool.

    For each of the two predictions a ``<scopeID>.<hmm|pssm>.sov.in`` file is
    written to ``outfolder``, ``calSOV -f 1 <in> -o <out>`` is run, and the
    SOV and accuracy fields are read from the second line of the resulting
    ``.sov.out`` file. The ``.sov.in`` / ``.sov.out`` files are left on disk.

    Args:
        outfolder: existing directory that receives the calSOV input/output files.
        scopeID: identifier used as the file-name stem.
        seq: amino-acid sequence, one character per residue.
        dssp: observed secondary structure, same length as ``seq``.
        hmm_pred: HMM-based prediction string, same length as ``seq``.
        pssm_pred: PSSM-based prediction string, same length as ``seq``.

    Returns:
        Tuple of floats ``(hmm_acc, pssm_acc, hmm_sov, pssm_sov)``.

    Raises:
        AssertionError: if ``seq`` is empty, the four strings differ in
            length, calSOV exits non-zero or produces no output file, or a
            reported score is NaN.
    """
    n=len(seq)
    assert n!=0, "empty sequence for %s" % scopeID
    assert all(len(s)==n for s in (dssp,hmm_pred,pssm_pred)), \
        "sequence/DSSP/prediction lengths differ for %s" % scopeID

    # '-' is rewritten to 'C' in every string before it is handed to calSOV
    dssp=dssp.replace('-','C')
    predictions=(
        ('hmm',hmm_pred.replace('-','C')),
        ('pssm',pssm_pred.replace('-','C')),
    )

    scores={}
    for (label,pred) in predictions:
        sov_in=os.path.join(outfolder,'%s.%s.sov.in' % (scopeID,label))
        sov_out=os.path.join(outfolder,'%s.%s.sov.out' % (scopeID,label))

        # one residue per row, whitespace separated - no commas allowed
        with open(sov_in,'w') as f:
            f.write("AA  OSEC PSEC\n")
            for (aa,osec,psec) in zip(seq,dssp,pred):
                f.write("%s   %s    %s\n" % (aa,osec,psec))

        # must have calSOV installed and path configured.
        # Arguments are passed as a list so paths containing whitespace survive.
        proc=subprocess.run(args=["calSOV","-f","1",sov_in,"-o",sov_out])
        assert proc.returncode==0, "calSOV failed on %s" % sov_in
        assert os.path.exists(sov_out), "calSOV produced no output: %s" % sov_out

        # scores are taken from whitespace-separated fields 1 (SOV) and 10
        # (accuracy) of the second line of the calSOV output
        with open(sov_out,'r') as f:
            fields=f.read().splitlines()[1].split()
        sov,acc=fields[1],fields[10]

        # float() would happily parse "nan", so reject it explicitly
        assert all(v.lower() not in ("nan","-nan") for v in (sov,acc)), \
            "calSOV reported NaN in %s" % sov_out

        scores[label]=(float(acc),float(sov))

    (hmm_acc,hmm_sov)=scores['hmm']
    (pssm_acc,pssm_sov)=scores['pssm']
    return (hmm_acc,pssm_acc,hmm_sov,pssm_sov)

# Keras

In [7]:
# When True, the Keras loop prints the inputs and outputs of every calc() call.
debug=False
# Cross-validation fold numbers to process (1 through 7).
irange=range(1,8)
# Experiment name; substituted into the results.pkl input paths and into the
# sov/ and results/ output paths of the Keras loop.
expt_name='20Sep_2'

## WARNING: THIS DOES NOT USE PSSMb NEURAL NET - NEEDS TO BE UPDATED TO LOAD RESULTS FROM .../PSSMb/...

In [15]:
# Directory holding one <seqID>.fasta file per sequence.
keras_fasta_dir='/homes/adrozdetskiy/Projects/jpredJnet231ReTrainingSummaryTable/scores/training'

# Letter emitted for class index 0, 1 and 2 respectively.
# NOTE(review): the notebook header records an H/E swap fix on 27 Aug; confirm
# this order matches the class encoding used when the networks were trained.
keras_state_letters=['E','H','-']


def keras_states_from_probs(prob_matrix):
    """Turn a (residues x 3) score matrix into a prediction string.

    The arg-max of every row selects a letter from ``keras_state_letters``,
    e.g. rows [0.8,0.1,0.1], [0.3,0.4,0.3], [0.005,0.005,0.98] -> 'EH-'.
    """
    return ''.join(keras_state_letters[k] for k in np.argmax(prob_matrix,axis=1))


for i in irange:

    # NOTE: pickle.load can execute arbitrary code; only load result files
    # produced by our own training runs.

    # read hmm predictions
    hmm_preds_pkl="/cluster/gjb_lab/2472402/outputs/keras_train_CV/HMM/%s/cross-val%d/results.pkl" % (expt_name,i)
    assert os.path.exists(hmm_preds_pkl), hmm_preds_pkl
    with open(hmm_preds_pkl,'rb') as f:
        (_,(seqIDs,dssp_list,_,hmm_preds))=pickle.load(f)

    # read pssm predictions
    pssm_preds_pkl="/cluster/gjb_lab/2472402/outputs/keras_train_CV/PSSM/%s/cross-val%d/results.pkl" % (expt_name,i)
    assert os.path.exists(pssm_preds_pkl), pssm_preds_pkl
    with open(pssm_preds_pkl,'rb') as f:
        (_,(seqIDs2,_,_,pssm_preds))=pickle.load(f)

    # both networks must have been evaluated on the same sequences, in the same order
    assert len(seqIDs)==len(seqIDs2) and all(
        np.array_equal(a,b) for (a,b) in zip(seqIDs,seqIDs2)
    ), "HMM and PSSM results list different seqIDs for cross-val%d" % i

    # calSOV scratch files for this fold; a dedicated name is used so the
    # root_folder variable read by the SNNS loop is not overwritten
    keras_sov_root='/cluster/gjb_lab/2472402/sov/keras/%s/' % expt_name
    sov_folder=os.path.join(keras_sov_root, 'cross-val%d' % i)
    os.makedirs(sov_folder,exist_ok=True)

    results=[]
    alignments=[]

    for (seqID,dssp_vector,hmm_probabilities,pssm_probabilities) in zip(seqIDs,dssp_list,hmm_preds,pssm_preds):

        # convert probability distributions into prediction strings; the DSSP
        # reference is stored in the same per-residue vector form
        hmm_pred=keras_states_from_probs(hmm_probabilities)
        pssm_pred=keras_states_from_probs(pssm_probabilities)
        dssp=keras_states_from_probs(dssp_vector)

        # the sequence is the second line of the FASTA file
        fasta_file=os.path.join(keras_fasta_dir,'%s.fasta' % seqID)
        assert os.path.exists(fasta_file), fasta_file
        with open(fasta_file,'r') as fasta:
            seq=fasta.read().splitlines()[1]

        # check sanity of input to calc()
        if debug:
            print("seqID:   %s" % seqID)
            print("DSSP:      %s" % dssp)
            print("HMM pred:  %s" % hmm_pred)
            print("PSSM pred: %s" % pssm_pred)

        (hmm_acc,pssm_acc,hmm_sov,pssm_sov)=calc(sov_folder,seqID,seq,dssp,hmm_pred,pssm_pred)

        # check sanity of output of calc()
        if debug:
            print("output of calc: ")
            print("HMM acc:   %s" % hmm_acc)
            print("PSSM acc:  %s" % pssm_acc)
            print("HMM SOV:   %s" % hmm_sov)
            print("PSSM SOV:  %s" % pssm_sov)

        #store results
        results.append((seqID,hmm_acc,pssm_acc,hmm_sov,pssm_sov))
        alignments.append((seqID,seq,dssp,hmm_pred,pssm_pred))


    # output results to csv file
    out_folder='/cluster/gjb_lab/2472402/results/keras/%s' % expt_name
    os.makedirs(out_folder,exist_ok=True)
    out_csv='cv%i_scores.csv' % i
    with open(os.path.join(out_folder,out_csv),'w') as f:
        f.write("seqID,HMM_acc,PSSM_acc,HMM_sov,PSSM_sov\n")
        for (seqID,hmm_acc,pssm_acc,hmm_sov,pssm_sov) in results:
            f.write("%s,%.2f,%.2f,%.2f,%.2f\n" % (seqID,hmm_acc,pssm_acc,hmm_sov,pssm_sov))

    # write alignments to kerasnet file
    out_knet='cv%i.knet' % i
    with open(os.path.join(out_folder,out_knet),'w') as f:
        for (seqID,seq,dssp,hmm_pred,pssm_pred) in alignments:
            f.write("seqID     : %s\n" % seqID)
            f.write("sequence  : %s\n" % seq)
            f.write("DSSP      : %s\n" % dssp)
            f.write("HMM_pred  : %s\n" % hmm_pred)
            f.write("PSSM_pred : %s\n" % pssm_pred)

TypeError: unsupported operand type(s) for //: 'list' and 'int'

# SNNS

In [9]:
# When True, the SNNS loop prints the inputs and outputs of every calc() call.
debug=False
# Cross-validation fold numbers to process (1 through 7).
irange=range(1,8)
# Experiment name; substituted into the sov/ and results/ output paths of the SNNS loop.
expt_name='25Aug'
# Directory whose cross-val<i> subfolders are scanned for .jnet files.
root_folder="/cluster/gjb_lab/2472402/snns_cross_val_25_Aug/"

In [10]:
# Reference data: one <seqID>.dssp and one <seqID>.fasta file per sequence.
data_folder="/homes/adrozdetskiy/Projects/jpredJnet231ReTrainingSummaryTable/scores/training/"

for i in irange:

    # every <seqID>.jnet file in this fold's folder is one prediction to score
    cross_val_folder=os.path.join(root_folder, 'cross-val%d' % i)
    seqIDs=sorted(name[:-len('.jnet')] for name in os.listdir(cross_val_folder) if name.endswith('.jnet'))

    # calSOV scratch files for this fold; makedirs also creates missing parents
    sov_root='/cluster/gjb_lab/2472402/sov/snns/%s' % expt_name
    sov_folder=os.path.join(sov_root,'cross-val%d' % i)
    os.makedirs(sov_folder,exist_ok=True)

    results=[]
    alignments=[]

    for seqID in seqIDs:

        # assumes the JNETHMM and JNETPSSM records are the 7th and 8th lines
        # of the .jnet file - TODO confirm against the wrapper.pl output
        jnet_path=os.path.join(cross_val_folder,seqID+'.jnet')
        with open(jnet_path,'r') as f:
            lines=f.read().splitlines()
        hmm_pred=lines[6].replace("JNETHMM:","").replace(",","")
        pssm_pred=lines[7].replace("JNETPSSM:","").replace(",","")

        # the DSSP file content (trailing whitespace stripped) is the reference string
        dssp_file=os.path.join(data_folder,seqID+'.dssp')
        assert os.path.exists(dssp_file), dssp_file
        with open(dssp_file,'r') as f:
            dssp=f.read().rstrip()

        # the sequence is the second line of the FASTA file
        fasta_file=os.path.join(data_folder,seqID+'.fasta')
        assert os.path.exists(fasta_file), fasta_file
        with open(fasta_file,'r') as f:
            seq=f.read().splitlines()[1]

        # check sanity of input to calc()
        if debug:
            print("scopeID:   %s" % seqID)
            print("DSSP:      %s" % dssp)
            print("HMM pred:  %s" % hmm_pred)
            print("PSSM pred: %s" % pssm_pred)

        (hmm_acc,pssm_acc,hmm_sov,pssm_sov)=calc(sov_folder,seqID,seq,dssp,hmm_pred,pssm_pred)

        # check sanity of output of calc()
        if debug:
            print("output of calc: ")
            print("HMM acc:   %s" % hmm_acc)
            print("PSSM acc:  %s" % pssm_acc)
            print("HMM SOV:   %s" % hmm_sov)
            print("PSSM SOV:  %s" % pssm_sov)

        #store results
        results.append((seqID,hmm_acc,pssm_acc,hmm_sov,pssm_sov))
        alignments.append((seqID,seq,dssp,hmm_pred,pssm_pred))


    # output results to csv file
    out_folder="/cluster/gjb_lab/2472402/results/snns/%s" % expt_name
    os.makedirs(out_folder,exist_ok=True)
    out_csv="cv%i_scores.csv" % i
    with open(os.path.join(out_folder,out_csv),'w') as f:
        f.write("seqID,HMM_acc,PSSM_acc,HMM_sov,PSSM_sov\n")
        for (seqID,hmm_acc,pssm_acc,hmm_sov,pssm_sov) in results:
            f.write("%s,%.2f,%.2f,%.2f,%.2f\n" % (seqID,hmm_acc,pssm_acc,hmm_sov,pssm_sov))

    # write alignments to a .knet file (same layout as the Keras output)
    out_knet="cv%i.knet" % i
    with open(os.path.join(out_folder,out_knet),'w') as f:
        for (seqID,seq,dssp,hmm_pred,pssm_pred) in alignments:
            f.write("seqID     : %s\n" % seqID)
            f.write("sequence  : %s\n" % seq)
            f.write("DSSP      : %s\n" % dssp)
            f.write("HMM_pred  : %s\n" % hmm_pred)
            f.write("PSSM_pred : %s\n" % pssm_pred)