# Shared functions

In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import subprocess

In [2]:
# Write function calc() that outputs:
# 1. HMM Q3 accuracy (N matches/N total)
# 2. PSSM Q3 accuracy
# 3. HMM SOV (call calSOV binary from python) 
# 4. PSSM SOV
# 
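# Given the following arguments
# 1. outfolder (directory where the calSOV input/output files are written)
# 2. scopeID (used to name those files)
# 3. sequence
# 4. dssp
# 5. hmm prediction
# 6. pssm prediction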

def calc(outfolder,scopeID,seq,dssp,hmm_pred,pssm_pred):
    """Score an HMM and a PSSM secondary-structure prediction against DSSP.

    One calSOV input table is written per prediction into ``outfolder``, the
    external ``calSOV`` binary is run on each, and the scores are read back
    from the files it produces.  The intermediate files
    ``<scopeID>.{hmm,pssm}.sov.{in,out}`` are left in ``outfolder``.

    Args:
        outfolder: directory that receives the calSOV input/output files.
        scopeID: identifier used to name those files.
        seq: amino-acid sequence, one character per residue (must be non-empty).
        dssp: observed secondary structure, same length as ``seq``.
        hmm_pred: HMM-based prediction, same length as ``seq``.
        pssm_pred: PSSM-based prediction, same length as ``seq``.
        In the three structure strings '-' is rewritten to 'C' before scoring.

    Returns:
        Tuple of floats ``(hmm_acc, pssm_acc, hmm_sov, pssm_sov)``.  The values
        come from the second line of each calSOV output file: field 1 is used
        as the SOV score and field 10 as the accuracy.

    Raises:
        AssertionError: if the inputs are empty or differ in length, if calSOV
            exits non-zero or produces no output file, or if any score is NaN.
    """
    import math  # local import: only needed for the NaN check below

    n=len(seq)
    assert n!=0, "empty sequence for %s" % scopeID
    assert all(len(s)==n for s in (dssp,hmm_pred,pssm_pred)), \
        "sequence, DSSP and predictions differ in length for %s" % scopeID

    hmm_pred=hmm_pred.replace('-','C')
    pssm_pred=pssm_pred.replace('-','C')
    dssp=dssp.replace('-','C')

    # write two sov.in files
    hmm_sov_in=os.path.join(outfolder,'%s.hmm.sov.in' % scopeID)
    pssm_sov_in=os.path.join(outfolder,'%s.pssm.sov.in' % scopeID)
    _write_sov_input(hmm_sov_in,seq,dssp,hmm_pred)
    _write_sov_input(pssm_sov_in,seq,dssp,pssm_pred)

    hmm_sov_out=os.path.join(outfolder,'%s.hmm.sov.out' % scopeID)
    pssm_sov_out=os.path.join(outfolder,'%s.pssm.sov.out' % scopeID)

    # must have calSOV installed and path configured
    _run_calsov(hmm_sov_in,hmm_sov_out)
    _run_calsov(pssm_sov_in,pssm_sov_out)

    hmm_sov,hmm_acc=_read_sov_scores(hmm_sov_out)
    pssm_sov,pssm_acc=_read_sov_scores(pssm_sov_out)

    scores=tuple(float(v) for v in (hmm_acc,pssm_acc,hmm_sov,pssm_sov))
    # float() accepts "nan"/"-nan", so the NaN check has to be explicit
    assert not any(math.isnan(v) for v in scores), \
        "calSOV reported NaN for %s" % scopeID
    return scores


def _write_sov_input(path,seq,dssp,pred):
    """Write a calSOV input table (residue, observed, predicted) to ``path``."""
    with open(path,'w') as f:
        f.write("AA  OSEC PSEC\n")
        # whitespace-separated columns, no commas allowed
        f.writelines("%s   %s    %s\n" % row for row in zip(seq,dssp,pred))


def _run_calsov(sov_in,sov_out):
    """Run ``calSOV -f 1 <sov_in> -o <sov_out>`` and check that it succeeded."""
    # An argument list (not a split command string) keeps paths with spaces intact.
    proc=subprocess.run(args=["calSOV","-f","1",sov_in,"-o",sov_out])
    assert proc.returncode==0, "calSOV failed on %s" % sov_in
    assert os.path.exists(sov_out), "calSOV did not write %s" % sov_out


def _read_sov_scores(sov_out):
    """Return ``(sov, acc)`` as strings from line 2 of a calSOV output file."""
    with open(sov_out,'r') as f:
        lines=f.read().splitlines()
    ans=lines[1].split()
    return ans[1],ans[10]

# Keras

In [4]:
# Score the Keras networks on cross-validation folds 1-7.
# For every fold: load the HMM- and PSSM-based class probabilities, convert them
# to H/E/- strings, score them against DSSP with calc(), then write a scores CSV
# and an alignment (.knet) file for the fold.

# folder holding the <scopeID>.sec files (sequence + DSSP); the same for every fold
data_folder="/homes/adrozdetskiy/Projects/jpredJnet231ReTrainingSummaryTable/data/training/"

for i in range(1,8):

    # read hmm predictions
    # NOTE: pickle.load can execute code embedded in the file -- only load trusted results.
    hmm_preds_pkl="/cluster/gjb_lab/2472402/outputs/keras_train_CV/HMM/20Aug/cv%d/results.pkl" % i
    assert os.path.exists(hmm_preds_pkl)
    with open(hmm_preds_pkl,'rb') as f:
        (_,(scopeIDs,_,_,hmm_preds))=pickle.load(f)

    # read pssm predictions
    pssm_preds_pkl="/cluster/gjb_lab/2472402/outputs/keras_train_CV/PSSM/20Aug/cv%d/results.pkl" % i
    assert os.path.exists(pssm_preds_pkl)
    with open(pssm_preds_pkl,'rb') as f:
        (_,(scopeIDs2,_,_,pssm_preds))=pickle.load(f)

    # Both pickles must list the same domains in the same order.  zip() stops at
    # the shorter input, so the lengths have to be compared explicitly as well.
    assert len(scopeIDs)==len(scopeIDs2)
    assert all(s1==s2 for (s1,s2) in zip(scopeIDs,scopeIDs2))

    # calSOV input/output files for this fold go here; check once, before any scoring
    sov_folder='/cluster/gjb_lab/2472402/sov/keras/20Aug/cv%d/' % i
    assert os.path.exists(sov_folder)

    results=[]
    alignments=[]

    for (scopeID,hmm_probabilities,pssm_probabilities) in zip(scopeIDs,hmm_preds,pssm_preds):

        # convert probability distribution into prediction string:
        # take the argmax of each row, then map class index 0 -> 'H', 1 -> 'E', 2 -> '-'
        # e.g. rows [0.8,0.1,0.1], [0.3,0.4,0.3], [0.005,0.005,0.98] -> 'HE-'
        hmm_pred=''.join('HE-'[k] for k in np.argmax(hmm_probabilities,axis=1))
        pssm_pred=''.join('HE-'[k] for k in np.argmax(pssm_probabilities,axis=1))

        sec_file=data_folder+scopeID+'.sec'

        assert os.path.exists(sec_file)
        with open(sec_file,'r') as f:
            sec=f.read().splitlines()
        # line 0: comma-separated sequence, line 1: comma-separated DSSP states
        seq=sec[0].replace("SEQUENCE:","").replace(",","")
        # reduce DSSP states to three classes: B -> E; I, G, S, T -> '-'
        dssp=sec[1].replace("DSSP:","")
        dssp=dssp.replace("B","E")
        for state in ("I","G","S","T"):
            dssp=dssp.replace(state,"-")
        dssp=dssp.replace(",","")

        # check sanity of input to calc()
        print("scopeID:   %s" % scopeID)
        print("DSSP:      %s" % dssp)
        print("HMM pred:  %s" % hmm_pred)
        print("PSSM pred: %s" % pssm_pred)

        (hmm_acc,pssm_acc,hmm_sov,pssm_sov)=calc(sov_folder,scopeID,seq,dssp,hmm_pred,pssm_pred)

        # check sanity of output of calc()
        print("output of calc: ")
        print("HMM acc:   %s" % hmm_acc)
        print("PSSM acc:  %s" % pssm_acc)
        print("HMM SOV:   %s" % hmm_sov)
        print("PSSM SOV:  %s" % pssm_sov)

        #store results
        results.append((scopeID,hmm_acc,pssm_acc,hmm_sov,pssm_sov))
        alignments.append((scopeID,seq,dssp,hmm_pred,pssm_pred))


    # output results to csv file
    out_file="/cluster/gjb_lab/2472402/results/keras/20Aug/cv%i_scores.csv" % i
    with open(out_file,'w') as f:
        f.write("scopeID,HMM_acc,PSSM_acc,HMM_sov,PSSM_sov\n")
        for row in results:
            f.write("%s,%.2f,%.2f,%.2f,%.2f\n" % row)

    # write alignments to kerasnet file
    align_file="/cluster/gjb_lab/2472402/results/keras/20Aug/cv%i.knet" % i
    with open(align_file,'w') as f:
        for (scopeID,seq,dssp,hmm_pred,pssm_pred) in alignments:
            f.write("scopeID   : %s\n" % scopeID)
            f.write("sequence  : %s\n" % seq)
            f.write("DSSP      : %s\n" % dssp)
            f.write("HMM_pred  : %s\n" % hmm_pred)
            f.write("PSSM_pred : %s\n" % pssm_pred)

scopeID:   d2ia7a1
DSSP:      ------HHHHHHHHHHHHH--------------------------HHHHHHHHHHHHHHHHHH---EEEEEEEEE-------EEEEEEEEEE----EEEEEEEEEE-----
HMM pred:  --EEEHHHHHHHHHHHHHH------------H---HHHHH-----HHHHHHHHHHHHHHHHHH---EEEEEEEEEE------EEEEEEEEEEE----EEEEEEEEEE----
PSSM pred: -EEEEE-HHHHHHHHHHHH------EEE-------HHHHH-----HHHHHHHHHHHHHHHHH----EEEEEEEEEE-----EEEEEEEEEEEEE---EEEEEEEEEE----
output of calc: 
HMM acc:   87.39
PSSM acc:  81.08
HMM SOV:   68.1
PSSM SOV:  64.05
scopeID:   d3rq7a1
DSSP:      -HHHHHHHHHHHHH--------------E---------EEEEEEE----EEEEEE----EEEEE----EEEE-----EEEEE-----EEEEE--------HHHHHHHHHHHHHHHH---------
HMM pred:  --HHHHHHHHHHH----HH-----------------EEEEEEEE-----EEEEEE----EEEEE----EEEE-----EEEEEE----EEEE------HHHHHHHHHHHHHHHHHHHHH-------
PSSM pred: -HHHHHHHHHHHH---HHH-----------------EEEEEEEE-----EEEEEEE---EEEEE----EEEEE----EEEEEE------EE------HHHHHHHHHHHHHHHHHHHHH-------
output of calc: 
HMM acc:   88.0
PSSM acc:  84.8
HMM SOV:   86.09
PSSM SOV:  83.16
scopeID:   d3rj

# SNNS

In [5]:
# Score the SNNS (Jnet) predictions on cross-validation folds 1-7.
# For every fold: list the fold's domains, read each domain's HMM/PSSM prediction
# from its .jnet file and its sequence/DSSP from its .sec file, score them with
# calc(), then write a scores CSV and an alignment (.knet) file for the fold.

# folder holding both the <scopeID>.jnet and <scopeID>.sec files; the same for every fold
jnet_folder="/homes/adrozdetskiy/Projects/jpredJnet231ReTrainingSummaryTable/data/training/"

for i in range(1,8):

    # read SCOPe domain names into a list
    data_folder="/cluster/gjb_lab/2472402/data/cross-val/train_patterns/cv%d/" % i
    # splitext removes exactly the ".pkl" suffix; str.rstrip('.pkl') would also eat
    # any trailing '.', 'p', 'k' or 'l' characters of the ID itself.
    # Entries that are not .pkl files are skipped; sorting makes the output order
    # independent of os.listdir's arbitrary ordering.
    scopeIDs=sorted(os.path.splitext(filename)[0]
                    for filename in os.listdir(data_folder)
                    if filename.endswith('.pkl'))

    # calSOV input/output files for this fold go here; check once, before any scoring
    sov_folder='/cluster/gjb_lab/2472402/sov/snns/20Aug/cv%d/' % i
    assert os.path.exists(sov_folder)

    results=[]
    alignments=[]

    for scopeID in scopeIDs:

        jnet_path=jnet_folder+scopeID+'.jnet'
        with open(jnet_path,'r') as f:
            lines=f.read().splitlines()
        # predictions are taken by position: line 6 = JNETHMM, line 7 = JNETPSSM
        # NOTE(review): relies on a fixed .jnet line layout -- confirm against the files
        hmm_pred=lines[6].replace("JNETHMM:","").replace(",","")
        pssm_pred=lines[7].replace("JNETPSSM:","").replace(",","")

        sec_file=jnet_folder+scopeID+'.sec'

        assert os.path.exists(sec_file)
        with open(sec_file,'r') as f:
            sec=f.read().splitlines()
        # line 0: comma-separated sequence, line 1: comma-separated DSSP states
        seq=sec[0].replace("SEQUENCE:","").replace(",","")
        # reduce DSSP states to three classes: B -> E; I, G, S, T -> '-'
        dssp=sec[1].replace("DSSP:","")
        dssp=dssp.replace("B","E")
        for state in ("I","G","S","T"):
            dssp=dssp.replace(state,"-")
        dssp=dssp.replace(",","")

        (hmm_acc,pssm_acc,hmm_sov,pssm_sov)=calc(sov_folder,scopeID,seq,dssp,hmm_pred,pssm_pred)

        #store results
        results.append((scopeID,hmm_acc,pssm_acc,hmm_sov,pssm_sov))
        alignments.append((scopeID,seq,dssp,hmm_pred,pssm_pred))


    # output results to csv file
    out_file="/cluster/gjb_lab/2472402/results/snns/20Aug/cv%i_scores.csv" % i
    with open(out_file,'w') as f:
        f.write("scopeID,HMM_acc,PSSM_acc,HMM_sov,PSSM_sov\n")
        for row in results:
            f.write("%s,%.2f,%.2f,%.2f,%.2f\n" % row)

    # write alignments to a .knet file (same layout as the Keras alignments)
    align_file="/cluster/gjb_lab/2472402/results/snns/20Aug/cv%i.knet" % i
    with open(align_file,'w') as f:
        for (scopeID,seq,dssp,hmm_pred,pssm_pred) in alignments:
            f.write("scopeID   : %s\n" % scopeID)
            f.write("sequence  : %s\n" % seq)
            f.write("DSSP      : %s\n" % dssp)
            f.write("HMM_pred  : %s\n" % hmm_pred)
            f.write("PSSM_pred : %s\n" % pssm_pred)