# Interaction sets preparation

I want to prepare 4 sets of interactions:

- $C^+$: Conservative criteria with endogenous peptide interactions
- $C^-$: Conservative criteria without endogenous peptide interactions

In [1]:
DATA_DIR = "../data"
INTERACTIONS_DIR = f"{DATA_DIR}/interactions"
PEPTIDES_DIR = f"{DATA_DIR}/peptides"

In [2]:
# STD Library
from xml.etree.ElementTree import fromstring
import json

# 3rd party packages
import requests
import pandas as pd
from Bio.Blast import NCBIWWW
from Bio import SeqRecord, Seq
import xmljson

In [3]:
! [ -f "../data/interactions/biogrid_interactions.tab3.zip" ] && echo "Biogrid dataset already downloaded." || curl https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.3.194/BIOGRID-ALL-4.3.194.tab3.zip -o ../data/interactions/biogrid_interactions.tab3.zip

Biogrid dataset already downloaded.


In [4]:
# Download the BioGRID dataset
interactions_df = pd.read_csv(f"{INTERACTIONS_DIR}/biogrid_interactions.tab3.zip", sep="\t")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
# What columns do we have?
interactions_df.columns

Index(['#BioGRID Interaction ID', 'Entrez Gene Interactor A',
       'Entrez Gene Interactor B', 'BioGRID ID Interactor A',
       'BioGRID ID Interactor B', 'Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Synonyms Interactor A',
       'Synonyms Interactor B', 'Experimental System',
       'Experimental System Type', 'Author', 'Publication Source',
       'Organism ID Interactor A', 'Organism ID Interactor B', 'Throughput',
       'Score', 'Modification', 'Qualifications', 'Tags', 'Source Database',
       'SWISS-PROT Accessions Interactor A', 'TREMBL Accessions Interactor A',
       'REFSEQ Accessions Interactor A', 'SWISS-PROT Accessions Interactor B',
       'TREMBL Accessions Interactor B', 'REFSEQ Accessions Interactor B',
       'Ontology Term IDs', 'Ontology Term Names', 'Ontology Term Categories',
       'Ontology Term Qualifier IDs', 'Ontology Term Qualifier Names',
       'Ontology

In [6]:
# Select useful columns and rename
interactions_df = interactions_df[[
    "SWISS-PROT Accessions Interactor A",
    "SWISS-PROT Accessions Interactor B",
    'Organism ID Interactor A',
    'Organism ID Interactor B',
    'Author',
    'Publication Source',
    "Experimental System"
]].rename(columns={
    "SWISS-PROT Accessions Interactor A": "prot_a",
    "SWISS-PROT Accessions Interactor B": "prot_b",
    'Organism ID Interactor A': "organism_a",
    'Organism ID Interactor B': "organism_b",
    'Author': "author",
    'Publication Source': "source",
    "Experimental System": "system"
})

In [7]:
# Filter out non-human interactions
HUMAN_TAX_ID = 9606
interactions_df = interactions_df[(interactions_df.organism_a == HUMAN_TAX_ID) & (interactions_df.organism_b == HUMAN_TAX_ID)]

In [8]:
# Ensure that prot_a is < prot_b lexicographically (useful for identifying duplicates)
temp_df = interactions_df[["prot_a", "prot_b"]].copy()
interactions_df["prot_a"] = temp_df.min(axis=1)
interactions_df["prot_b"] = temp_df.max(axis=1)

In [9]:
# Remove interactions with invalid uniprot IDs
interactions_df = interactions_df[(~interactions_df.prot_a.str.contains("-")) & (~interactions_df.prot_b.str.contains("-"))]

In [10]:
# Remove interactions where one of the partner is a protein complex
interactions_df = interactions_df[(~interactions_df.prot_a.apply(lambda x: "|" in x)) & (~interactions_df.prot_b.apply(lambda x: "|" in x))]

In [11]:
# I can now create the p+ dataset
p_plus = interactions_df[["prot_a", "prot_b"]].drop_duplicates()
p_plus.to_csv(f"{INTERACTIONS_DIR}/permissive_plus.txt", header=None, index=False, sep=" ")

In [12]:
# According to Positome paper (Dick et al., 2017)
CONSERVATIVE_SYSTEMS = [
    "Two-hybrid",
    "Affinity Capture-MS",
    "Affinity Capture-Western",
    "Reconstituted Complex",
    "Affinity Capture-Luminescence",
    "Co-crystal Structure",
    "Far Western",
    "FRET",
    "Protein-peptide",
    "Co-localization",
    "Affinity Capture-RNA",
    "Co-purification"
]

In [13]:
# Select the interactions detected with one of the conservative (robust) methods
conservative_systems = interactions_df[interactions_df.system.isin(CONSERVATIVE_SYSTEMS)]

In [14]:
# Group the interactions by protein pairs
pair_groups = conservative_systems[["prot_a", "prot_b", "author"]].groupby([conservative_systems.prot_a, conservative_systems.prot_b]).count()

In [15]:
# Get pairs that were reported by more than one author
pairs_with_mlo = pair_groups[pair_groups.author > 1] # mlo: multiple lines of evidence
c_plus = pd.DataFrame({ "prot_a": [p[0] for p in list(pairs_with_mlo.index)], "prot_b": [p[1] for p in list(pairs_with_mlo.index)] })

In [16]:
c_plus.to_csv(f"{INTERACTIONS_DIR}/conservative_plus.txt", header=None, index=False, sep=" ")

OK. At this point, I have my $C^+$ dataset. Now, I will need to identify proteins in the proteome that share high similarity with the peptides so that they can be removed from the interaction set to form the $C^-$ dataset.

In [17]:
# Load the FDA-approved peptides as a dataframe 
peptides = pd.read_csv(f"{PEPTIDES_DIR}/peptide_subset.csv")
peptides.head()

Unnamed: 0,peptide_name,status,first_approval,indications,therapeutic_area,route_of_administration,conjugation,molecular_target,chemical_basis,sequence,targets
0,corticotropin,Approved,1952,Multiple inflammatory diseases; West syndrome,CNS,Subcutaneous,No,MC receptors,native,SYSMEHFRWGKPVGKKRRPVKVYPDGAEDQLAEAFPLEF,Q01726;Q01718;P41968;P32245;P33032
1,calcitonin (salmon),Approved,1971,Hypercalcemia; Paget's disease; osteoporosis,Bones and connective tissues,Intranasal;subcutaneous,No,Calcitonin receptor,native,CSNLSTCVLGKLSQELHKLQTYPRTNTGSGTP,P30988
2,tetracosactide,Approved,1980,Multiple inflammatory diseases,Endocrinology,Subcutaneous,No,MC receptors,native,SYSMEHFRWGKPVGKKRRPVKVYP,Q01726;Q01718;P41968;P32245;P33032
3,calcitonin (human),Approved,1986,Hypercalcemia; Paget's disease; osteoporosis,Bones and connective tissues,Intramuscular;intranasal;subcutaneous,No,Calcitonin receptor,native,CGNLSTCVLGTYTQDFNKFHTFPQTAIGVGAP,Q16602
4,carperitide,Approved,1995,Acute decompensated heart failure,Cardiovascular,Intravenous,No,NPR-A,native,SLRRSSCFGGRMDRIGAQSGLGCNSFRY,P16066


In [18]:
# BLAST all the peptides to make find endogenous versions of the peptides (SLOW TO RUN, SO CACHE RESULTS)
#blast_results = {}
#for i, row in peptides.iterrows():
#    result_handle = NCBIWWW.qblast("blastp", "swissprot", SeqRecord.SeqRecord(row.Sequence, id=peptide_name))
#    blast_results[row["peptide_name"]] = result_handle.read()
#    print(f"Completed {i + 1}/{len(peptides)}...")
#open(f"{PEPTIDES_DIR}/peptide_blast_results.json", "w").write(json.dumps(blast_results))

In [19]:
blast_results = json.load(open(f"{PEPTIDES_DIR}/peptide_blast_results.json"))

In [20]:
# Parse the XML (why XML????) to JSON
blast_results_json = {}
for peptide, result in blast_results.items():
    parser = xmljson.Parker(dict_type=dict)
    blast_results_json[peptide] = parser.data(fromstring(result))

In [21]:
def filter_hits(blast_dict, min_align_len, min_identity):
    """Goes through the hits returned by BLAST and selects the proteins where there
    is a HSP and returns a set with the accession IDs where a HSP is found with alignment length
    >= to min_align_len and identify >= min_identity.
    """
    to_remove = set([])
    for hit in blast_dict["BlastOutput_iterations"]["Iteration"]["Iteration_hits"]["Hit"]:
        protein_name = hit["Hit_def"].split(";")[0].split("Full=")[1]
        hsps = [hit["Hit_hsps"]["Hsp"]] if not isinstance(hit["Hit_hsps"]["Hsp"], list) else hit["Hit_hsps"]["Hsp"]
        for hsp in hsps:
            if hsp["Hsp_align-len"] >= min_align_len and hsp['Hsp_identity'] >= min_identity:
                to_remove.add(hit["Hit_accession"])
                
    return to_remove

In [22]:
# Group all the endogenous analogs of peptides
IDENTITY_THRESHOLD = 0.9 # percentage identity for a HIT

proteins_with_high_identity = set([])
for i, row in peptides.iterrows():
    proteins_with_high_identity.update(filter_hits(blast_results_json[row.peptide_name], len(row.sequence), IDENTITY_THRESHOLD*len(row.sequence)))

In [23]:
c_minus = c_plus[(~c_plus.prot_a.isin(proteins_with_high_identity)) & (~c_plus.prot_b.isin(proteins_with_high_identity))]

In [24]:
# How many interactions did we remove?
len(c_plus) - len(c_minus)

8

In [25]:
# Which pairs did I remove are they?
removed = c_plus[(c_plus.prot_a.isin(proteins_with_high_identity)) | (c_plus.prot_b.isin(proteins_with_high_identity))]
removed["name_a"] = removed["prot_a"].apply(lambda uid: requests.get(f"https://www.ebi.ac.uk/proteins/api/proteins/{uid}.fasta").text.split("|")[2].split(" OS")[0])
removed["name_b"] = removed["prot_b"].apply(lambda uid: requests.get(f"https://www.ebi.ac.uk/proteins/api/proteins/{uid}.fasta").text.split("|")[2].split(" OS")[0])
removed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  removed["name_a"] = removed["prot_a"].apply(lambda uid: requests.get(f"https://www.ebi.ac.uk/proteins/api/proteins/{uid}.fasta").text.split("|")[2].split(" OS")[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  removed["name_b"] = removed["prot_b"].apply(lambda uid: requests.get(f"https://www.ebi.ac.uk/proteins/api/proteins/{uid}.fasta").text.split("|")[2].split(" OS")[0])


Unnamed: 0,prot_a,prot_b,name_a,name_b
14726,P01160,P17342,ANF_HUMAN Natriuretic peptides A,ANPRC_HUMAN Atrial natriuretic peptide receptor 3
14727,P01160,P20594,ANF_HUMAN Natriuretic peptides A,ANPRB_HUMAN Atrial natriuretic peptide receptor 2
14728,P01160,P43365,ANF_HUMAN Natriuretic peptides A,MAGAC_HUMAN Melanoma-associated antigen 12
14730,P01189,P32245,COLI_HUMAN Pro-opiomelanocortin,MC4R_HUMAN Melanocortin receptor 4
14748,P01275,P43220,GLUC_HUMAN Pro-glucagon,GLP1R_HUMAN Glucagon-like peptide 1 receptor
23262,P16066,P16860,ANPRA_HUMAN Atrial natriuretic peptide receptor 1,ANFB_HUMAN Natriuretic peptides B
23563,P16860,P17342,ANFB_HUMAN Natriuretic peptides B,ANPRC_HUMAN Atrial natriuretic peptide receptor 3
23564,P16860,P20594,ANFB_HUMAN Natriuretic peptides B,ANPRB_HUMAN Atrial natriuretic peptide receptor 2


In [26]:
c_minus.to_csv(f"{INTERACTIONS_DIR}/conservative_minus.txt", index=False, header=None, sep=" ")