In [1]:
import sys
sys.path.insert(0,'../part1')

from utils import *
import pandas as pd

# Constants: input files used by this notebook (paths are relative to the notebook location)
# Tabular psiblast search result, parsed line by line further down
PSIBLAST_PATH = '../../results/psiblast_search.txt'
# Table of pdb entries, read with pandas.read_csv further down
# NOTE(review): the extension reads 'cvs' rather than 'csv' - confirm it matches the file on disk
PDB_PATH =  '../../datasets/pdb.cvs'

In [2]:
# Collect the subject-sequence intervals reported by psiblast.
# Resulting layout: { acc : [ {'start': start_pos, 'end': end_pos}, ... ] }

psiblast_sh2_positions = {}
with open(PSIBLAST_PATH) as f:
    for line in f:
        # a line holding at most one character (e.g. a bare newline) stops the parsing
        if len(line) <= 1:
            break
        # every record must split into exactly twelve whitespace-separated columns
        qseqid, sseqid, pident, length, mismatch, gapopen, \
        qstart, qend, sstart, send, evalue, bitscore = line.strip().split()
        pos = {'start': int(sstart), 'end': int(send)}
        # fetch the intervals already stored for this accession (creating the list if needed)
        known_positions = psiblast_sh2_positions.setdefault(sseqid, [])
        # keep each distinct interval only once per accession
        if pos not in known_positions:
            known_positions.append(pos)

In [3]:
# Spot-check: show the intervals stored for accession P12931
psiblast_sh2_positions['P12931']

[{'start': 152, 'end': 246}]

In [4]:
# Load pdb dataset into a DataFrame (pandas.read_csv with default options)
pdb = pd.read_csv(PDB_PATH)
# Preview the first rows to check the columns that were read
pdb.head()

Unnamed: 0,pdb,sp_primary,chain,sp_beg,sp_end
0,1a07,P12931,A,144,249
1,1a07,P12931,B,144,249
2,1a08,P12931,A,144,249
3,1a08,P12931,B,144,249
4,1a09,P12931,A,144,249


In [5]:
# Check whether the sp_beg column contains any missing value
pdb[['sp_beg']].isna().any()

sp_beg    False
dtype: bool

In [6]:
# Check whether the sp_end column contains any missing value
pdb[['sp_end']].isna().any()

sp_end    False
dtype: bool

In [7]:
def filter_pdb(domain, pdb, threshold):
    """Select the pdb intervals whose F1 overlap with the domain exceeds a threshold.

    Parameters
    ----------
    domain : forwarded unchanged to createPositionSet; the returned collection
        is used as the positions covered by the domain.
    pdb : iterable of items whose elements [0] and [1] are the first and last
        position (both inclusive) covered by a pdb entry.
    threshold : an interval is kept only when its F1 score is strictly greater
        than this value.

    Returns
    -------
    list of int
        Positions (within ``pdb``) of the intervals that were kept.
    """
    domain_pos = createPositionSet(domain)

    indices = []
    for idx, extremes in enumerate(pdb):
        # positions covered by this pdb entry, both extremes included
        pdb_pos = set(range(extremes[0], extremes[1] + 1))

        shared = pdb_pos.intersection(domain_pos)
        if not shared:
            # no position in common: the entry cannot pass the filter
            continue

        n_shared = len(shared)
        n_only_domain = len(domain_pos) - n_shared
        n_only_pdb = len(pdb_pos) - n_shared

        precision = n_shared / (n_shared + n_only_pdb)
        recall = n_shared / (n_shared + n_only_domain)
        # harmonic mean of precision and recall
        f1 = 2 * (precision * recall) / (precision + recall)

        if f1 > threshold:
            indices.append(idx)

    return indices

In [8]:
# Minimum F1 overlap (strictly greater than) required to keep a pdb entry
F1_THRESHOLD = 0.1

# ids of the pdb entries that pass the overlap filter (a set, so each id is stored once)
covering_pdbs = set()

for protein, domain_extremes in psiblast_sh2_positions.items():
    # rows of the pdb table that refer to this protein (mask computed once, used twice)
    protein_rows = pdb.sp_primary == protein
    # extract the extremes of all the pdbs available for the protein
    pdb_extremes = pdb.loc[protein_rows, ['sp_beg', 'sp_end']].values
    # nothing to check when no pdb is available for the protein
    if not len(pdb_extremes):
        continue
    pdb_ids = pdb.loc[protein_rows, 'pdb'].values
    # The complete list of domain matches is handed to filter_pdb in a single call:
    # repeating the same call once per match would only re-add the same ids.
    # NOTE(review): if each match was meant to be compared with the pdbs on its own,
    # filter_pdb must be called once per match instead - confirm against the input
    # expected by createPositionSet in utils.
    indices = filter_pdb(domain_extremes, pdb_extremes, F1_THRESHOLD)
    covering_pdbs.update(pdb_ids[indices].tolist())

print('Number of pdbs covering SH2 domain:', len(covering_pdbs))

Number of pdbs covering SH2 domain: 352


In [9]:
# Save the selected pdb ids, one per line.
# The ids are written in sorted order: iterating the set directly gives an order that
# can change from one kernel session to the next, making the file non reproducible.
# key=str keeps the sort well defined even if an id is not a string.
with open('../../datasets/covering_pdb.txt', 'w') as fout:
    for pdb_id in sorted(covering_pdbs, key=str):
        fout.write("{}\n".format(pdb_id))