In [1]:
import h5py
import json
import numpy as np
import torch as pt
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from glob import glob

from structures_store import h5_load_structure

In [2]:
# Accessions whose `selected` column exceeds 0.5 in the model selection table.
selection_table = pd.read_csv("datasets/selected_alphafold_models.csv")
uniprots = selection_table.query("selected > 0.5")['uniprot'].values

# Load one structure per selected accession from the aggregated HDF5 file,
# keyed by accession. (Iterate over hf.keys() instead of `uniprots` to load
# every entry stored in the file.)
with h5py.File("datasets/aggregated_structures_data.h5", 'r') as hf:
    structures = {
        uniprot: h5_load_structure(hf[uniprot])
        for uniprot in tqdm(uniprots)
    }

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 560125/560125 [29:56<00:00, 311.85it/s]


In [3]:
# uniprot features: indexed by accession; each entry is iterated as a list of
# feature records that are read through their 'pos', 'ftype' and 'desc' keys
# further down in this notebook.
# Context managers are used so the file handles are closed deterministically
# instead of being left open until garbage collection.
with open("datasets/uniprot_features.json", 'r') as fs:
    features_dict = json.load(fs)

# clustered interfaces: indexed as multi_interfaces_dict[accession][itype],
# which yields a list of residue-index groups (one group per interface).
with open("datasets/clustered_multi_interfaces.json", 'r') as fs:
    multi_interfaces_dict = json.load(fs)

In [4]:
# Flatten the clustered interfaces into one list of records per accession,
# giving every retained interface a globally unique, consecutive id.
interfaces_dict = {}
k = -1  # running interface id; incremented before use, so ids start at 0
for uniprot in tqdm(structures):
    entries = []
    for itype, clusters in multi_interfaces_dict[uniprot].items():
        # Skip interface types whose name contains '+'
        # (presumably combined multi-partner types -- confirm).
        if '+' in itype:
            continue
        for iids in clusters:
            k += 1
            entries.append({'iid': k, 'itype': itype, 'iids': iids})
    interfaces_dict[uniprot] = entries

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 560125/560125 [00:05<00:00, 106586.40it/s]


In [5]:
# aggregate predicted interfaces and mutagenesis sites
#
# For every annotated feature of every loaded structure, emit:
#   * one record per interface that shares at least one residue with the feature,
#   * or a single "no interface" record (iid == -1) when nothing overlaps,
#   * or a single placeholder record (mafs == mpi == -1.0) when the feature
#     extends past the end of the structure's 'afs' array.
# Maps an interface type name to the index k of the structure's 'p{k}' array
# (presumably the per-residue predicted score for that type -- confirm).
labels_dict = {"protein":0, "dna/rna":1, "ion":2, "ligand":3, "lipid":4}


def _feature_row(uniprot, fid, ftr, n_feature, iid=-1, itype="",
                 n_interface=0, n_common=0, mafs=-1.0, mpi=-1.0):
    """Build one output record for a feature, optionally paired with an interface.

    Parameters
    ----------
    uniprot : accession the feature belongs to.
    fid : running feature id.
    ftr : feature record; its 'ftype', 'pos' and 'desc' entries are copied.
    n_feature : number of residues covered by the feature (column 'Nf').
    iid, itype : interface id and type; -1 and "" mean "no interface".
    n_interface : number of residues in the interface (column 'Ni').
    n_common : number of residues shared by feature and interface ('Nc').
    mafs : mean of the structure's 'afs' values over the relevant residues,
        or -1.0 when not computed.
    mpi : mean of the structure's 'p{k}' values over the shared residues,
        or -1.0 when there is no interface.

    Returns
    -------
    dict with the columns in the order they appear in the exported table.
    """
    return {
        'uniprot': uniprot,
        'fid': fid,
        'ftype': ftr['ftype'],
        'pos': ftr['pos'],
        'iid': iid,
        'itype': itype,
        'Ni': n_interface,
        'Nf': n_feature,
        'Nc': n_common,
        'mafs': mafs,
        'mpi': mpi,
        'desc': ftr['desc'],
    }


data = []
fid = -1  # running feature id; incremented before use, so ids start at 0
for uniprot in tqdm(structures):
    if (uniprot in interfaces_dict) and (uniprot in features_dict):
        interfaces = interfaces_dict[uniprot]
        features = features_dict[uniprot]
        # Hoisted once per protein instead of being looked up per interface.
        structure = structures[uniprot]
        afs = structure['afs']

        for ftr in features:
            fid += 1
            pos = ftr['pos']
            # 'pos' is treated as 1-based and inclusive: either a single
            # position or a [start, end] pair. Convert to 0-based indices.
            # NOTE: an inverted pair (end < start) would make `fids` empty and
            # the `fids[-1]` access below raise IndexError.
            if isinstance(pos, int):
                fids = np.arange(pos-1, pos)
            else:
                fids = np.arange(pos[0]-1, pos[1])

            # Feature reaches beyond the structure: record a placeholder row
            # and skip the overlap computation.
            if afs.shape[0] <= fids[-1]:
                data.append(_feature_row(uniprot, fid, ftr, len(fids)))
                continue

            b_no_overlap = True
            for interface in interfaces:
                iids = interface['iids']
                itype = interface['itype']

                # Residues shared by the interface and the feature.
                cids = np.intersect1d(iids, fids)

                if len(cids) > 0:
                    b_no_overlap = False

                    mpi = np.mean(structure['p{}'.format(labels_dict[itype])][cids])
                    mafs = np.mean(afs[cids])

                    data.append(_feature_row(
                        uniprot, fid, ftr, len(fids),
                        iid=interface['iid'],
                        itype=itype,
                        n_interface=len(iids),
                        n_common=len(cids),
                        mafs=mafs,
                        mpi=mpi,
                    ))

            if b_no_overlap:
                # No interface touches this feature: average over the whole feature.
                mafs = np.mean(afs[fids])
                data.append(_feature_row(uniprot, fid, ftr, len(fids), mafs=mafs))

    # gap check -> bad: the two most recent records must carry the same or
    # consecutive feature ids (every feature emits at least one record).
    if len(data) >= 2:
        assert (data[-1]['fid'] == data[-2]['fid']+1) or (data[-1]['fid'] == data[-2]['fid']), \
            "gap in feature ids after {}".format(uniprot)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 560125/560125 [12:26<00:00, 750.81it/s]


In [6]:
# Turn the collected records into a table, write it to disk without the row
# index, and show the (truncated) table as the cell output.
df = pd.DataFrame(data)
csv_path = "datasets/curated_interfaces_with_uniprot_human.csv"
df.to_csv(csv_path, index=False)
df

Unnamed: 0,uniprot,fid,ftype,pos,iid,itype,Ni,Nf,Nc,mafs,mpi,desc
0,A0A021WW64,0,region of interest,"[36, 67]",-1,,0,32,0,43.111562,-1.000000,Disordered
1,A0A021WW64,1,region of interest,"[220, 251]",-1,,0,32,0,47.713750,-1.000000,Disordered
2,A0A021WW64,2,compositionally biased region,"[226, 244]",-1,,0,19,0,48.143158,-1.000000,Basic and acidic residues
3,A0A023GPK8,3,signal peptide,"[1, 21]",-1,,0,21,0,34.742857,-1.000000,
4,A0A023GPK8,4,chain,"[22, 1447]",6,protein,16,1426,16,86.198126,0.664849,
...,...,...,...,...,...,...,...,...,...,...,...,...
8189644,Z4YNJ9,3503309,domain,"[101, 315]",5561802,ligand,18,215,12,98.497500,0.644665,FA_desaturase
8189645,Z4YNJ9,3503309,domain,"[101, 315]",5561803,ligand,1,215,1,98.500000,0.640303,FA_desaturase
8189646,Z4YNJ9,3503309,domain,"[101, 315]",5561804,lipid,106,215,80,98.355374,0.797019,FA_desaturase
8189647,Z4YNJ9,3503310,region of interest,"[1, 31]",-1,,0,31,0,34.698065,-1.000000,Disordered
