In [1]:
## Program to read in interacting residues from Burke et al. catalogued structures and annotate with IUPred2A disorder prediction
## Created by: Joelle Strom
## Last updated: 17.05.2024

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

In [3]:
def extr_int_res(name):
    """ Find output file for given Burke-modeled interaction and return locations of interacting residues in each chain """
    
    # Define file path with given interaction name and read contents
    filename = 'Y:/publications/AlphaFold/Burke_et_al_NSMB2023/HuRI/'+name+'/'+name+'.pLDDT'
    with open(filename) as f:
        contents = f.readlines()

    # Iterate through contents - after the 'list of interacting residues' header, append residue numbers to list
    prota = []
    protb = []
    restime = 0
    for line in contents:
        if restime == 1:
            curline = line.strip('\n').split(' ')
            prota.append(curline[1])
            protb.append(curline[2])
        if 'LIST OF INTERACTING RESIDUES' in line:
            restime = 1
            
    # Minor processing of lists - remove duplicates and convert to integers
    prota = list(dict.fromkeys(prota))
    prota = [int(x) for x in prota]
    protb = list(dict.fromkeys(protb))
    protb = [int(x) for x in protb]
    
    return prota, protb

# Import file with annotated data
final = pd.read_csv('C:/Users/stromjoe/Documents/projects/Burke/Processed_datasets/burkeAnnotated.csv')
# Filter out low-confidence models
highconf = final[final.pDockQ > 0.5]
print(highconf.shape[0])

3009


In [6]:
# Obtain disorder predictions for interface residues
if_numfrac1 = []
if_numfrac2 = []
if_numres1 = []
if_numres2 = []

for i in tqdm(highconf.index):
    # Get interacting residues and uniprot IDs for current interaction
    prota, protb = extr_int_res(highconf.Name.loc[i])
    names = [highconf.Uniprot1.loc[i], highconf.Uniprot2.loc[i]]
    numfrac = []
    numres = []
    # Interact with IUPred2A REST API
    for j in range(0,2):
        if j == 0:
            prot = prota
        elif j == 1:
            prot = protb
        if names[j] == 'O76011':
            names[j] = 'A0A140TA69' #Change to the isomer with correct length for this specific case (according to len in Burke data set)
        if names[j] == 'Q9NX55': #Burke sequence has an extended sequence - shift residue numbers to correspond with IUPred
            prot = [x-8 for x in prot]
            prot = [x for x in prot if x > 0]
        url = 'http://iupred2a.elte.hu/iupred2a/short/'+names[j]+'.json'
        response = requests.get(url)
        res = response.json()
        iupred = pd.Series(res['iupred2'])
        if len(iupred.index) > 0:
            iupred.index = iupred.index + 1
            # Restrict disorder predictions to only the residues which are in interface
            # Try-except statement is necessary because some interface residue indices are outside range of UniProt sequence, even with the named exceptions above
            # This is only true for a small fraction (9/3009 cases) - will not take the time to manually fix each of these instances but will rather return NaN
            try:
                iupredres = iupred[prot]
                numfrac.append(len(iupredres[iupredres > 0.4]))
                numres.append(len(prot))
            except:
                print(names[j])
                numfrac.append(np.nan)
                numres.append(len(prot))

    if_numfrac1.append(numfrac[0])
    if_numfrac2.append(numfrac[1])
    if_numres1.append(numres[0])
    if_numres2.append(numres[1])

  6%|▋         | 191/3009 [08:55<2:04:48,  2.66s/it]

Q14241


 43%|████▎     | 1292/3009 [51:45<1:05:00,  2.27s/it]

Q01433


 43%|████▎     | 1293/3009 [51:47<1:05:29,  2.29s/it]

P23109


 43%|████▎     | 1299/3009 [52:01<1:05:53,  2.31s/it]

Q12983


 53%|█████▎    | 1587/3009 [1:02:58<54:03,  2.28s/it]

Q96HE8


 56%|█████▌    | 1676/3009 [1:06:13<52:51,  2.38s/it]

P78358


 76%|███████▌  | 2276/3009 [1:29:10<26:59,  2.21s/it]

P23109


 78%|███████▊  | 2358/3009 [1:32:20<25:40,  2.37s/it]

Q96HE8


 95%|█████████▍| 2846/3009 [1:51:26<06:27,  2.38s/it]

Q8N5Z5


100%|██████████| 3009/3009 [1:57:40<00:00,  2.35s/it]


In [8]:
# Append to dataframe
highconf['if_numres1'] = if_numres1
highconf['if_numres2'] = if_numres2
highconf['if_numdis1'] = if_numfrac1
highconf['if_numdis2'] = if_numfrac2

# Write to disk
highconf.to_csv('C:/Users/stromjoe/Documents/projects/DDI_IF-analysis/highconfpred.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highconf['if_numres1'] = if_numres1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highconf['if_numres2'] = if_numres2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highconf['if_numdis1'] = if_numfrac1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,co