In [3]:
from Bio import SeqIO
import math

In [4]:
# Sequences really containing PF00018 (the reference set used below)
proteins_true = []
# Open the FASTA file in a `with` block so the handle is closed as soon as
# parsing finishes; the bare open() used before was never closed.
with open('Swiss_Human/PF00018_human.fasta') as fasta_handle:
    fasta_sequences = SeqIO.parse(fasta_handle, 'fasta')
    for fasta in fasta_sequences:
        # The record id is split on "|" and the second field is kept
        # (presumably a UniProt-style "db|ACCESSION|NAME" id - confirm).
        accession = fasta.id.split("|")[1]
        print(accession)
        proteins_true.append(accession)

# check that they are 101
print(len(proteins_true))

Q96HU1
P42681
Q15642
Q13882
Q15811
Q92783
Q7Z6B7
O60504
P02549
Q96N96
O75044
P19878
Q99961
Q8TEC5
Q7Z6J0
P19174
P06239
O75962
Q13813
Q12965
Q9NZQ3
Q8NFA2
Q86WV1
O75563
P07948
P98171
Q15080
P12931
O94875
O43295
Q8TEJ3
Q9H6Q3
Q5HYK7
P16885
P20936
Q9BRR9
Q92882
O15259
Q13239
Q96MF2
P16333
P15498
A1X283
Q9BX66
Q5TCZ1
O75886
P42680
Q9Y5X1
P07947
O43639
Q06187
Q9ULH1
Q9UNF0
P06241
Q86UR1
Q08881
P14598
O43307
Q12959
Q9NZM3
Q8N157
P62993
P08631
P00519
P14317
P42684
Q9H6S3
Q6XZF7
P51451
O94868
O75791
P41240
Q8IZP0
P42679
Q8TE68
P09769
P55345
P42685
Q92796
Q9NR80
Q96KQ4
Q9Y5K6
Q5T0N5
P56945
P46108
Q9H7D0
Q13625
Q12929
P78352
P46109
Q13588
Q8N5V2
Q8TE67
Q8TF17
Q96HL8
Q99469
A6NI72
A8MVU1
A1IGU5
Q8TE82
Q8TC17
101


In [5]:
# Sequences retrieved by PSIBLAST
proteins_psi = []
# `with` guarantees the results file is closed after the scan; the handle
# was previously left open for the lifetime of the kernel.
with open("results/psiblast_out.txt", "r") as psi:
    for line in psi:
        # Lines starting with ">" are treated as hit headers; the six
        # characters right after ">" are stored as the accession.
        # NOTE(review): a fixed 6-character slice assumes 6-character
        # accessions - confirm against the PSI-BLAST output format.
        if line.startswith(">"):
            proteins_psi.append(line[1:7])

# How many?
print(len(proteins_psi))

107


In [6]:
# Confusion matrix for the PSI-BLAST hits.
# Total number of sequences used to derive the true negatives
# (presumably the size of the searched database - confirm).
TOTAL_SEQUENCES = 20367

# Work on sets for every count: TP is computed from a set intersection, so
# deriving FP/FN from raw list lengths would inflate them whenever an
# accession appears more than once in either list.
true_ids = set(proteins_true)
psi_ids = set(proteins_psi)

# True positive
TP = len(true_ids & psi_ids)
# False Positive
FP = len(psi_ids) - TP
# False Negative
FN = len(true_ids) - TP
# True Negative
TN = TOTAL_SEQUENCES - (TP + FP + FN)
print("TP: {}\nFP: {}\nFN: {}\nTN:{}".format(TP,FP,FN,TN))

TP: 72
FP: 35
FN: 29
TN:20231


In [15]:
# PSSM metrics
# Uses whatever TP / FP / FN / TN are currently defined in the kernel, so the
# matching confusion-matrix cell has to be run immediately before this one.
predicted_pos = TP + FP
predicted_neg = TN + FN
actual_pos = TP + FN
actual_neg = TN + FP

acc = (TP + TN) / (actual_pos + actual_neg)
pre = TP / predicted_pos
sen = TP / actual_pos
spe = TN / actual_neg
# Matthews correlation coefficient: cross-product difference over the
# square root of the product of the four marginal totals.
mcc = (TP * TN - FP * FN) / math.sqrt(
    predicted_pos * actual_pos * actual_neg * predicted_neg
)

print(
    "PSSM metrics:",
    f"accuracy: {acc}",
    f"precision: {pre}",
    f"sensitivity: {sen}",
    f"specificity: {spe}",
    f"mcc: {mcc}",
    sep="\n",
)

PSSM metrics:
accuracy: 0.9968576619040604
precision: 0.6728971962616822
sensitivity: 0.7128712871287128
specificity: 0.9982729695055759
mcc: 0.6910202965864407


In [16]:
# Sequences retrieved by HMMER
proteins_hmmer = []
# `with` closes the hmmsearch output once it has been scanned. The handle
# also gets its own name instead of reusing `psi`, which referred to the
# PSI-BLAST results file.
with open("results/hmmsearch_out.hmmer_align", "r") as hmmer_out:
    for line in hmmer_out:
        # Lines starting with ">>" are treated as per-sequence headers;
        # characters 6..11 of the line are stored as the accession.
        # NOTE(review): this assumes a ">> xx|ACCESS|..." layout with a
        # 6-character accession - confirm against the hmmsearch output.
        if line.startswith(">>"):
            proteins_hmmer.append(line[6:12])

# How many?
print(len(proteins_hmmer))

190


In [17]:
# Confusion matrix for the HMMER hits.
# Total number of sequences used to derive the true negatives
# (presumably the size of the searched database - confirm).
TOTAL_SEQUENCES = 20367

# Work on sets for every count: TP is computed from a set intersection, so
# deriving FP/FN from raw list lengths would inflate them whenever an
# accession appears more than once in either list.
true_ids = set(proteins_true)
hmmer_ids = set(proteins_hmmer)

# True positive
TP = len(true_ids & hmmer_ids)
# False Positive
FP = len(hmmer_ids) - TP
# False Negative
FN = len(true_ids) - TP
# True Negative
TN = TOTAL_SEQUENCES - (TP + FP + FN)
print("TP: {}\nFP: {}\nFN: {}\nTN:{}".format(TP,FP,FN,TN))

TP: 97
FP: 93
FN: 4
TN:20173


In [18]:
# HMM metrics
# Uses whatever TP / FP / FN / TN are currently defined in the kernel, so the
# matching confusion-matrix cell has to be run immediately before this one.
predicted_pos = TP + FP
predicted_neg = TN + FN
actual_pos = TP + FN
actual_neg = TN + FP

acc = (TP + TN) / (actual_pos + actual_neg)
pre = TP / predicted_pos
sen = TP / actual_pos
spe = TN / actual_neg
# Matthews correlation coefficient: cross-product difference over the
# square root of the product of the four marginal totals.
mcc = (TP * TN - FP * FN) / math.sqrt(
    predicted_pos * actual_pos * actual_neg * predicted_neg
)

print(
    "HMM metrics:",
    f"accuracy: {acc}",
    f"precision: {pre}",
    f"sensitivity: {sen}",
    f"specificity: {spe}",
    f"mcc: {mcc}",
    sep="\n",
)

HMM metrics:
accuracy: 0.9952373938233416
precision: 0.5105263157894737
sensitivity: 0.9603960396039604
specificity: 0.995411033257673
mcc: 0.6984090341888622


In [10]:
# Dump the HMMER accessions to disk, one per line.
with open('datasets/original.txt', 'w') as out_file:
    for accession in proteins_hmmer:
        print(accession, file=out_file)