In [1]:
'''Code adapted from: https://github.com/bio-ontology-research-group/machine-learning-with-ontologies'''

import gzip
interactions = {}
data = []
# Load data
org_id = '9606' # Change to 9606 for Human

lines = 0
self_ppis = 0
with gzip.open(f'data/{org_id}.protein.links.v11.0.txt.gz', 'rt') as f:
    next(f) # Skip header
    for line in f:
        lines += 1
        p1, p2, score = line.strip().split()
        if float(score) < 700: # Filter high confidence interactions
            continue
        if p1 not in interactions:
            interactions[p1] = set()
        if p2 not in interactions:
            interactions[p2] = set()
        if p2 not in interactions[p1]:
            interactions[p1].add(p2)
            interactions[p2].add(p1)
            data.append((p1, p2))

        if p1 == p2:
            self_ppis += 1
print("self ppis:" , self_ppis)
print('Total number of interactions confidence > 700:', len(data))
print('Total number of proteins:', len(interactions.keys()))
print('Total number of interactions in string file:', lines)

self ppis: 0
Total number of interactions confidence > 700: 420534
Total number of proteins: 17185
Total number of interactions in string file: 11759454


In [10]:
import numpy as np
import math

np.random.seed(seed=0) # Fix random seed for reproducibility
np.random.shuffle(data)
train_n = int(math.ceil(len(data) * 0.8))
valid_n = int(math.ceil(train_n * 0.8))
train_data = data[:valid_n]
valid_data = data[valid_n:train_n]
test_data = data[train_n:]
print('Number of training interactions:', len(train_data))
print('Number of validation interactions:', len(valid_data))
print('Number of testing interactions:', len(test_data))

Number of training interactions: 76193
Number of validation interactions: 19048
Number of testing interactions: 23810


In [11]:
def save(filename, data):
    with open(filename, 'w') as f:
        for p1, p2 in data:
            f.write(f'{p1}\t{p2}\n')
            #f.write(f'{p2}\t{p1}\n')

save(f'data/train/{org_id}.no-mirror.protein.links.v11.0.txt', train_data)
save(f'data/valid/{org_id}.no-mirror.protein.links.v11.0.txt', valid_data)
save(f'data/test/{org_id}.no-mirror.protein.links.v11.0.txt', test_data)

In [12]:
import random
proteins =set ()
negatives = []
for (p1,p2) in data:
        proteins.add(p1)
        proteins.add(p2)
while len(negatives)<len(data):
        if(len(negatives) % 10000 == 0):
            print(len(negatives))
        s = random.sample(proteins, 2)
        prot1= s[0]
        prot2= s[1]
        if (prot1,prot2) in negatives or (prot2,prot1) in negatives :
                 continue
        if prot1 not in interactions[prot2]:
                 negatives.append((prot1, prot2))
print('Total number of negative interactions:', len(negatives))
# Split negative data
neg_train_data = negatives[:valid_n]
neg_valid_data = negatives[valid_n:train_n]
neg_test_data = negatives[train_n:]
print(len(negatives))
print('Number of negative training interactions:', len(neg_train_data))
print('Number of negative validation interactions:', len(neg_valid_data))
print('Number of negative testing interactions:', len(neg_test_data))

0


since Python 3.9 and will be removed in a subsequent version.
  s = random.sample(proteins, 2)


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
110000
Total number of negative interactions: 119051
119051
Number of negative training interactions: 76193
Number of negative validation interactions: 19048
Number of negative testing interactions: 23810


In [13]:
save(f'data/train/{org_id}.no-mirror.negative_interactions.txt', neg_train_data)
save(f'data/valid/{org_id}.no-mirror.negative_interactions.txt', neg_valid_data)
save(f'data/test/{org_id}.no-mirror.negative_interactions.txt', neg_test_data)