In [2]:
import urllib.request
import gzip
from tqdm import tqdm as prog_bar

import random
random.seed(10)

from pathlib import Path

In [2]:
def download_file(url, save_path):
    """Retrieve the resource at ``url`` and store it at ``save_path``.

    Thin wrapper around ``urllib.request.urlretrieve``; the retrieved
    content is written to ``save_path`` and nothing is returned.
    """
    urllib.request.urlretrieve(url, filename=save_path)

In [10]:
# Fetch the Gene Ontology and the GO annotation (GAF) files into the working
# directory. Each step sets `url` / `save_path` and passes both to
# download_file, which requires the two arguments.

#download the most recent gene ontology (obo file)
url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
save_path = 'go-basic.obo'
download_file(url, save_path)
print("Ontology downloaded")

#download the most recent Human protein association file
url = 'http://geneontology.org/gene-associations/goa_human.gaf.gz'
save_path = 'goa_human.gaf.gz'
download_file(url, save_path)
print("Human association file downloaded")

#download the most recent Yeast protein association file
url = 'http://current.geneontology.org/annotations/sgd.gaf.gz'
save_path = 'sgd.gaf.gz'
download_file(url, save_path)
print("Yeast association file downloaded")

In [13]:
# The dataset-generation cell further down opens the links archives as
# 'stringDB-files/<taxon>.protein.links.v11.5.txt.gz', so the downloads are
# written into that folder (created on demand) instead of the working
# directory; otherwise a fresh Restart & Run All cannot find the files.
Path('stringDB-files').mkdir(parents=True, exist_ok=True)

#download interaction data for Yeast (4932)
url = 'https://stringdb-static.org/download/protein.links.v11.5/4932.protein.links.v11.5.txt.gz'
save_path = 'stringDB-files/4932.protein.links.v11.5.txt.gz'
download_file(url, save_path)
print("Yeast STRING-DB interaction file downloaded")

#download interaction data for Human (9606)
url = 'https://stringdb-static.org/download/protein.links.v11.5/9606.protein.links.v11.5.txt.gz'
save_path = 'stringDB-files/9606.protein.links.v11.5.txt.gz'
download_file(url, save_path)
print("Human STRING-DB interaction file downloaded")

Yeast STRING-DB interaction file downloaded
Human STRING-DB interaction file downloaded


In [4]:
# STRING-DB alias tables to fetch, as (source URL, local file name, message
# printed once the download has finished); Yeast is 4932, Human is 9606.
alias_downloads = (
    (
        'https://stringdb-static.org/download/protein.aliases.v11.5/4932.protein.aliases.v11.5.txt.gz',
        '4932.protein.aliases.v11.5.txt.gz',
        "Yeast STRING-DB aliases file downloaded",
    ),
    (
        'https://stringdb-static.org/download/protein.aliases.v11.5/9606.protein.aliases.v11.5.txt.gz',
        '9606.protein.aliases.v11.5.txt.gz',
        "Human STRING-DB aliases file downloaded",
    ),
)

#download protein aliases for Yeast (4932) and Human (9606) proteins, in that order
for url, save_path, done_message in alias_downloads:
    download_file(url, save_path)
    print(done_message)

Yeast STRING-DB aliases file downloaded
Human STRING-DB aliases file downloaded


In [6]:
def save_interactions(filename, interactions):
    """Write interaction pairs to ``filename`` as tab-separated lines.

    ``interactions`` is an iterable of two-item pairs; each pair becomes one
    line of the form ``<first>\\t<second>``. Any existing file is overwritten.
    """
    with open(filename, 'w') as out_file:
        out_file.writelines(
            f'{first}\t{second}\n' for first, second in interactions
        )

In [7]:
def generate_dataset(protein_links_path, poz_intr_file_name, neg_intr_file_name, score_treshold = 700):
    """Create positive and negative interaction files from a gzipped links file.

    The input is read as text; its first line is skipped as a header and every
    other non-blank line must hold three whitespace-separated fields:
    ``protA protB score``.

    Positive set: the first occurrence of each unordered pair whose score is
    ``>= score_treshold``. Negative set: the same number of unordered pairs of
    two *different* proteins, drawn from the proteins of the positive set, that
    are not listed in the input file at any score.

    Parameters
    ----------
    protein_links_path : path of the gzip-compressed links file.
    poz_intr_file_name : output path for the positive pairs.
    neg_intr_file_name : output path for the negative pairs.
    score_treshold : minimum score for a pair to count as positive.

    Raises
    ------
    ValueError
        If fewer unlisted protein pairs exist than positive pairs, so a
        balanced negative set cannot be produced. The positive file has
        already been written at that point.

    Both outputs are written with ``save_interactions``. Negative sampling
    uses the global ``random`` state, so seed it for reproducible results.
    """
    self_ppis = 0
    poz_proteins = []
    pozitive_intr = []
    negative_intr = []
    intr_sets = {}
    line_nr = 0

    # POSITIVE INTERACTIONS
    print("Filtering POSITIVE interactions...")
    with gzip.open(protein_links_path, 'rt') as f:
        next(f, None) # skip information about file creation etc. (tolerates an empty file)
        for line in f:
            fields = line.split()
            if not fields:
                continue # blank line, nothing to parse
            line_nr += 1
            protA, protB, score = fields

            #one partner set per protein, created on first sight
            partners_a = intr_sets.setdefault(protA, set())
            partners_b = intr_sets.setdefault(protB, set())

            #the pair was already recorded (e.g. the same pair listed as B A)
            if protA in partners_b:
                continue

            #add only interactions over treshold
            if float(score) >= score_treshold:
                pozitive_intr.append((protA, protB))
                poz_proteins.append(protA)
                poz_proteins.append(protB)

                if protA == protB:
                    self_ppis += 1

            #remember each interaction partner seen in the file, whatever the score
            partners_a.add(protB)
            partners_b.add(protA)

    candidates = set(poz_proteins)

    print("Self interactions in STRING-DB file:" , self_ppis)
    print('Total number of positive interactions in STRING-DB file:', line_nr)
    print(f'Total number of positive interactions with confidence >= {score_treshold}:', len(pozitive_intr))
    print('Total number of proteins in the selected positive interactions:', len(candidates), "\n")

    print('Saving POSITIVE interactions to files...')
    save_interactions(poz_intr_file_name, pozitive_intr)

    # NEGATIVE INTERACTIONS
    print("Generating NEGATIVE interactions that do not appear in STRING-DB (regardless of confidence score)...")

    # Rejection sampling below can only finish if enough unlisted pairs exist.
    # intr_sets is symmetric, so every listed pair is counted twice here.
    listed_pairs = sum(
        1
        for prot in candidates
        for partner in intr_sets[prot]
        if partner != prot and partner in candidates
    ) // 2
    free_pairs = len(candidates) * (len(candidates) - 1) // 2 - listed_pairs
    if free_pairs < len(pozitive_intr):
        raise ValueError(
            f'Cannot generate {len(pozitive_intr)} negative interactions: only '
            f'{free_pairs} protein pairs are absent from {protein_links_path}'
        )

    neg_proteins = set()
    seen_negatives = set() # set lookup keeps the duplicate check O(1) per draw
    while len(negative_intr) < len(pozitive_intr):
        # poz_proteins keeps one entry per positive interaction, so proteins
        # are drawn proportionally to how often they occur in the positive set
        protA, protB  = random.sample(poz_proteins, 2)

        #two list positions can hold the same protein; a self pair is not a negative
        if protA == protB:
            continue

        #check if interactions has already been generated
        if (protA, protB) in seen_negatives or (protB, protA) in seen_negatives:
            continue

        #check if interaction is not present in STRING-DB
        if protA not in intr_sets[protB]:
            negative_intr.append((protA, protB))
            seen_negatives.add((protA, protB))
            neg_proteins.add(protA)
            neg_proteins.add(protB)

            if len(negative_intr) % 20000 == 0:
                print("Negative interactions added: ", len(negative_intr), f"/{len(pozitive_intr)}")

    print('Total number of negative interactions', len(negative_intr))
    print('Total number of proteins in the selected negative interactions:', len(neg_proteins), "\n")

    print('Saving NEGATIVE interactions to files...')
    save_interactions(neg_intr_file_name, negative_intr)

In [9]:
# Build the positive/negative interaction files for one organism
# (9606 is the taxon id used for Human in the download cells).
organism = 9606
folder = 'interaction-datasets'
Path(folder).mkdir(parents=True, exist_ok=True)

generate_dataset(
    protein_links_path=f'stringDB-files/{organism}.protein.links.v11.5.txt.gz',
    poz_intr_file_name=f'{folder}/{organism}.protein.links.v11.5.txt',
    neg_intr_file_name=f'{folder}/{organism}.protein.negative.v11.5.txt',
)

Filtering POSITIVE interactions...
Self interactions in STRING-DB file: 0
Total number of positive interactions in STRING-DB file: 11938498
Total number of positive interactions with confidence >= 700: 252984
Total number of proteins in the selected positive interactions: 16814 

Saving POSITIVE interactions to files...
Generating NEGATIVE interactions that do not appear in STRING-DB (regardless of confidence score)...
Negative interactions added:  20000 /252984
Negative interactions added:  40000 /252984
Negative interactions added:  60000 /252984
Negative interactions added:  80000 /252984
Negative interactions added:  100000 /252984
Negative interactions added:  120000 /252984
Negative interactions added:  140000 /252984
Negative interactions added:  160000 /252984
Negative interactions added:  180000 /252984
Negative interactions added:  200000 /252984
Negative interactions added:  220000 /252984
Negative interactions added:  240000 /252984
Total number of negative interactions 252