In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the header-less, tab-separated pair table; each row is
# (prot1, prot2, score).
data = pd.read_csv(
    '../SENSE-PPI/data/senseppi_data/protein.pairs_9606.tsv',
    names=['prot1', 'prot2', 'score'],
    sep='\t',
)

In [None]:
# Degree of a protein = number of positive (score == 1) pairs it appears in,
# counted once per appearance as prot1 and once per appearance as prot2.
# Proteins that only occur in non-positive pairs keep a degree of 0.
prots = set(data['prot1']).union(set(data['prot2']))
prot_degree = {prot: 0 for prot in prots}

positive_pairs = data[data['score'] == 1]
# Counting both columns in one vectorized pass replaces a Python-level
# iterrows() loop, which builds a Series per row and is very slow on large
# pair tables. The resulting counts are the same.
positive_counts = pd.concat(
    [positive_pairs['prot1'], positive_pairs['prot2']]
).value_counts()
prot_degree.update(positive_counts.to_dict())

In [None]:
# Distinct proteins that occur (in either column) in score == 1 rows and in
# score == 0 rows respectively. The row masks are computed once and reused.
is_positive_row = data['score'] == 1
is_negative_row = data['score'] == 0
prots_from_positive = (
    set(data.loc[is_positive_row, 'prot1']) | set(data.loc[is_positive_row, 'prot2'])
)
prots_from_negative = (
    set(data.loc[is_negative_row, 'prot1']) | set(data.loc[is_negative_row, 'prot2'])
)

In [None]:
# Report how many distinct proteins appear in each of the two sets.
n_positive_prots = len(prots_from_positive)
n_negative_prots = len(prots_from_negative)
print(
    'Number of proteins in the positive set:', n_positive_prots,
    '\nNumber of proteins in the negative set:', n_negative_prots,
)

In [None]:
# For every pair, look up the degree of both endpoints and collect them into
# one list for score == 1 rows and another for all remaining rows.
# Vectorized replacement for a per-row iterrows() loop; the output order is
# unchanged (prot1 degree, then prot2 degree, row by row).
endpoint_degrees = np.column_stack([
    data['prot1'].map(prot_degree),
    data['prot2'].map(prot_degree),
])
is_positive_pair = (data['score'] == 1).to_numpy()
# ravel() on the (n_rows, 2) array walks row by row, which yields the
# interleaved prot1/prot2 ordering described above.
positive_degrees = endpoint_degrees[is_positive_pair].ravel().tolist()
# Any score other than 1 is treated as negative.
negative_degrees = endpoint_degrees[~is_positive_pair].ravel().tolist()

In [None]:
# Overlay the endpoint-degree histograms of the positive and negative pairs.
# Both use the same unit-width bin edges (0 through 99); the y-axis is capped
# at 10000.
degree_bins = range(0, 100, 1)
fig, ax = plt.subplots()
ax.hist(positive_degrees, bins=degree_bins, alpha=0.9, label='Positive')
ax.hist(negative_degrees, bins=degree_bins, alpha=0.5, label='Negative')
ax.legend(loc='upper right')
ax.set_ylim(0, 10000)
# ax.set_xlim(0, 20)
plt.show()

In [33]:
import argparse
import torch
from torch.utils.data import DataLoader
from dataset import PairSequenceData
import os
from model import AttentionModel

# Load a trained AttentionModel checkpoint and run it over the test pairs one
# at a time, stopping at the first pair whose prediction exceeds 0.9.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

max_len = 800

# NOTE(review): max_len - 2 presumably leaves room for two special tokens
# added by the tokenizer; confirm against PairSequenceData.
dataset_test = PairSequenceData(actions_file="../SENSE-PPI/data/dscript_data/human_test.tsv",
                                sequences_file="../SENSE-PPI/data/dscript_data/human.fasta",
                                max_len=max_len-2)

loader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)

print(len(dataset_test))

parser = argparse.ArgumentParser()
parser = AttentionModel.add_model_specific_args(parser)
# parser = pl.Trainer.add_argparse_args(parser)
# parse_known_args() is used so unrelated argv entries do not abort parsing.
params, unknown = parser.parse_known_args()

params.max_len = max_len
params.devices = 1
params.accelerator = "cuda"

model = AttentionModel(params, ntoken=len(dataset_test.tokenizer), embed_dim=256)

# The model is never moved to another device in this cell, so map the
# checkpoint tensors to CPU; this also lets a GPU-saved checkpoint load on a
# machine without CUDA.
ckpt = torch.load("logs/AttentionModelBase/version_0/checkpoints/chkpt_loss_based_epoch=0-val_loss=0.253-val_BinaryF1Score=0.231.ckpt",
                  map_location="cpu")
model.load_state_dict(ckpt['state_dict'])
# Inference only: switch layers such as dropout to evaluation behaviour.
model.eval()

# NOTE(review): the recorded run of this cell ended with
# "IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)".
# The printed input_ids appear to carry an extra leading dimension on top of
# the DataLoader batch dimension, which may be related; confirm against
# PairSequenceData.__getitem__ and AttentionModel.forward.
with torch.no_grad():  # no gradients are needed to inspect predictions/weights
    for batch in loader_test:
        print(batch)
        pred, w1, w2 = model(batch, need_weights=True)
        # pred = pred[0]
        # w1 = w1[0]
        # w2 = w2[0]
        print(pred)
        if pred > 0.9:
            break

52612
{'tok1': {'input_ids': tensor([[[30, 16, 17,  6,  7,  7,  6,  8, 11,  8,  7,  5,  0, 13, 10, 17,  0,
          16, 19,  3,  0, 19,  2,  2, 15, 14,  8,  9,  1,  7,  7,  8,  8,  5,
          19,  0, 17,  4, 14, 19, 11,  7, 19,  8,  3,  6, 15,  0,  5,  7,  4,
          11, 15, 19, 10,  8, 19,  9,  3, 12,  3, 17, 11,  7,  6, 10,  0,  0,
          13,  8,  6,  4,  9, 11,  7,  0, 12,  9,  8,  3, 15, 11,  0,  3, 10,
           2,  0, 14, 11, 13,  5, 16, 11,  6,  6,  6,  9, 14, 15,  1,  1, 17,
          10,  4,  1,  9,  1,  1, 13,  2,  4, 13,  5,  9,  6,  2,  5,  3,  1,
           9,  5,  9, 11,  6,  8,  8,  7,  9,  9,  5,  8,  8,  4,  0,  4,  8,
           5, 14, 14,  9, 12, 13,  6,  0,  9,  6, 11,  3, 13, 11,  9, 11,  5,
          11,  4, 11,  5,  5,  3, 13,  9, 11,  6, 15,  0,  6, 19, 10,  1,  1,
          16,  7,  3, 16, 17,  0,  6, 11, 14,  0,  6,  4, 11, 16,  9,  8, 10,
          13,  7, 14, 12,  8,  9,  3, 16, 15,  5,  5,  5, 10,  0, 11,  9, 15,
          15,  7,  0, 16,  9,  8,  

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [10]:
from tqdm import tqdm
# Copy the header line, then every row whose last whitespace-separated field
# is an integer greater than 700, into string_12.0_700.tsv.
with open('../SENSE-PPI/protein.physical.links.full.v12.0.txt', 'r') as links_in, open('string_12.0_700.tsv', 'w') as links_out:
    header = links_in.readline()
    links_out.write(header)
    for row in tqdm(links_in):
        last_field = row.split()[-1]
        if int(last_field) > 700:
            links_out.write(row)

2365172266it [20:23, 1932806.30it/s]


In [29]:
import pandas as pd
from tqdm import tqdm
# Open the filtered, space-separated table as a chunked reader (100000 rows
# per chunk) instead of loading it all at once. Printing shows the reader
# object, not the data.
data_700 = pd.read_csv(
    'string_12.0_700.tsv',
    chunksize=100000,
    sep=' ',
)
print(data_700)

<pandas.io.parsers.readers.TextFileReader object at 0x7f28a9615000>


In [15]:
# Collect every distinct protein ID found in the protein1/protein2 columns of
# the chunked table.
# NOTE: data_700 is a chunk iterator; once this loop has consumed it, the
# read_csv cell has to be run again before this cell can be repeated.
protein_set = set()
for chunk in tqdm(data_700):
    prots = set(chunk['protein1']).union(chunk['protein2'])
    # Update in place: rebinding protein_set to protein_set.union(prots)
    # copies the whole accumulated set on every chunk.
    protein_set |= prots

2257it [16:20,  2.30it/s]


In [30]:
# Number of distinct protein IDs collected from the filtered table.
n_proteins = len(protein_set)
print(n_proteins)

16504586


In [34]:
from Bio import SeqIO
from tqdm import tqdm
# Write a reduced FASTA containing only the records whose ID is in
# protein_set. Each record is emitted as a ">id" line followed by the
# sequence on a single line.
with open('protein.sequences.v12.0.fa', 'r') as fasta_in, open('string_12.0_700.fasta', 'w') as fasta_out:
    for rec in tqdm(SeqIO.parse(fasta_in, 'fasta')):
        if rec.id not in protein_set:
            continue
        fasta_out.write(''.join(('>', rec.id, '\n', str(rec.seq), '\n')))

59309604it [04:43, 209324.76it/s]


In [35]:
# Read the reduced FASTA back in, report how many records it holds, and
# collect the distinct record IDs.
with open('string_12.0_700.fasta', 'r') as fasta_in:
    records = list(SeqIO.parse(fasta_in, 'fasta'))
print(len(records))
record_ids = {rec.id for rec in records}

16504586


In [36]:
# Print the size of the ID set from the links table, then the size of the ID
# set from the reduced FASTA, for side-by-side comparison.
for id_collection in (protein_set, record_ids):
    print(len(id_collection))

16504586
16504586
