# Imports

In [128]:
from IPython.display import clear_output # for clear cell display
import pandas as pd
import numpy as np
import time

## Read a pickled training set (e.g. biological process)

In [129]:
def get_df(address):
    """Load the pickled object at `address`, print its shape and return it.

    Args:
        address: Path handed straight to ``pd.read_pickle``.

    Returns:
        The unpickled object (the notebook uses it as a DataFrame).
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    frame = pd.read_pickle(address)
    dims = frame.shape
    print(dims)
    return frame

## Select only columns 1, 3, 4, 8 (1-based: accessions, labels, ngrams, embeddings)

In [130]:
def select_columns(df):
    """Return the columns of `df` at zero-based positions 0, 2, 3 and 7.

    Selection is positional, so `df` must have at least eight columns.
    """
    wanted_positions = [0, 2, 3, 7]
    return df.iloc[:, wanted_positions]

## Keep only proteins that exist in the PPI network

In [131]:
# Load the PPI edge list, then collapse it into an array of the distinct
# identifiers that occur in either the 'protein1' or the 'protein2' column.
ppi = pd.read_csv('./data/PPI.csv')
ppi = pd.concat([ppi[side] for side in ('protein1', 'protein2')]).unique()

def common_protein(df):
    """Keep only the rows of `df` whose first column value occurs in `ppi`.

    Args:
        df: DataFrame whose first column holds the identifiers to look up.

    Returns:
        The filtered DataFrame. Its shape is printed before returning.
    """
    # Vectorised membership test: the previous per-row `x in ppi` lambda
    # rescanned the whole `ppi` array from Python for every single row.
    in_network = df.iloc[:, 0].isin(ppi)
    df = df[in_network]
    print(df.shape)
    return df

## All pairs

In [132]:
def get_pairs(df):
    """Build one row per unordered pair of rows in `df`.

    The first column of `df` supplies the two identifiers of each pair and
    the second column supplies the vectors whose dot product becomes the
    pair's 'works' value. Pairs are emitted in (i, j) order with i < j.

    Args:
        df: DataFrame with identifiers in column 0 and vectors in column 1.

    Returns:
        DataFrame with columns 'protein1', 'protein2' and 'works'.
        The elapsed wall-clock time is printed before returning.
    """
    started = time.time()
    names = np.array(df.iloc[:, 0])
    vectors = np.array(df.iloc[:, 1])
    n_rows = df.shape[0]

    firsts, seconds, scores = [], [], []
    for a in range(n_rows):
        # Every row after `a` is paired with row `a` exactly once.
        partners = range(a + 1, n_rows)
        firsts.extend(names[a] for _ in partners)
        seconds.extend(names[b] for b in partners)
        scores.extend(np.dot(vectors[a], vectors[b]) for b in partners)
        # Progress indicator every 100 outer rows (skipping row 0).
        if a > 0 and a % 100 == 0:
            print(a)
            clear_output(wait=True)

    pairs = pd.DataFrame(
        {'protein1': firsts, 'protein2': seconds, 'works': scores},
        columns=['protein1', 'protein2', 'works'],
    )
    print("--- %s seconds ---" % round(time.time() - started, 2))
    return pairs

## Run for biological process

In [133]:
# Load the biological-process training pickle; get_df also prints its shape.
df = get_df('./data/train-bp.pkl')
df.head(1)

(36380, 8)


Unnamed: 0,accessions,gos,labels,ngrams,proteins,sequences,orgs,embeddings
36224,P81928,[GO:0007275],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4225, 4490, 1799, 3969, 7366, 3315, 2295, 588...",140U_DROME,MNFLWKGRRFLIAGILPTFEGAADEIVDKENKTYKAFLASKPPEET...,7227,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [134]:
# Keep only the columns at zero-based positions 0, 2, 3 and 7.
df = select_columns(df)
df.head(1)

Unnamed: 0,accessions,labels,ngrams,embeddings
36224,P81928,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4225, 4490, 1799, 3969, 7366, 3315, 2295, 588...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [135]:
# Drop rows whose first-column value is not in the PPI protein list;
# common_protein prints the shape that remains.
df = common_protein(df)
df.head(1)

(3834, 4)


Unnamed: 0,accessions,labels,ngrams,embeddings
42202,P62258,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[4043, 855, 1084, 5663, 1250, 998, 3960, 7194,...","[-0.707461, -0.440626, 0.28608, -1.23422, 0.47..."


In [136]:
# Build all protein pairs with their 'works' value and save them.
# NOTE(review): to_csv writes the row index as an extra unnamed column by
# default — confirm downstream readers expect it.
df = get_pairs(df)
df.to_csv('./data/workers_bp.csv')
print(df.head(5))

--- 28.24 seconds ---
  protein1 protein2  works
0   P62258   Q04917     64
1   P62258   P61981     52
2   P62258   P31947     83
3   P62258   P27348     59
4   P62258   P63104     57


## Run for cellular components

In [137]:
# Load the cellular-component training pickle; get_df also prints its shape.
df = get_df('./data/train-cc.pkl')
df.head(1)

(35546, 8)


Unnamed: 0,accessions,gos,labels,ngrams,proteins,sequences,orgs,embeddings
12122,P0DJZ0,[GO:0030430],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4272, 5432, 4637, 4737, 6726, 6511, 2203, 405...",11K_PAVHV,MQNNTTGMDTKSLKNCGQPKAVCTHCKHSPPCPQPGCVTKRPPVPP...,648237,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [138]:
# Keep only the columns at zero-based positions 0, 2, 3 and 7.
df = select_columns(df)
df.head(1)

Unnamed: 0,accessions,labels,ngrams,embeddings
12122,P0DJZ0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4272, 5432, 4637, 4737, 6726, 6511, 2203, 405...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [139]:
# Drop rows whose first-column value is not in the PPI protein list;
# common_protein prints the shape that remains.
df = common_protein(df)
df.head(1)

(4420, 4)


Unnamed: 0,accessions,labels,ngrams,embeddings
38960,P62258,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4043, 855, 1084, 5663, 1250, 998, 3960, 7194,...","[-0.707461, -0.440626, 0.28608, -1.23422, 0.47..."


In [140]:
# Build all protein pairs with their 'works' value and save them.
# NOTE(review): to_csv writes the row index as an extra unnamed column by
# default — confirm downstream readers expect it.
df = get_pairs(df)
df.to_csv('./data/workers_cc.csv')
print(df.head(5))

--- 44.47 seconds ---
  protein1 protein2  works
0   P62258   Q04917     20
1   P62258   P61981     24
2   P62258   P31947     23
3   P62258   P27348     24
4   P62258   P63104     26


## Run for molecular function

In [141]:
# Load the molecular-function training pickle; get_df also prints its shape.
df = get_df('./data/train-mf.pkl')
df.head(1)

(25224, 8)


Unnamed: 0,accessions,gos,labels,ngrams,proteins,sequences,orgs,embeddings
14149,P32234,[GO:0005525],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4317, 6328, 6550, 2984, 3669, 1368, 3356, 310...",128UP_DROME,MSTILEKISAIESEMARTQKNKATSAHLGLLKAKLAKLRRELISPK...,7227,"[-2.77645, -0.534929, 1.3827, -1.22483, -1.203..."


In [142]:
# Keep only the columns at zero-based positions 0, 2, 3 and 7.
df = select_columns(df)
df.head(1)

Unnamed: 0,accessions,labels,ngrams,embeddings
14149,P32234,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4317, 6328, 6550, 2984, 3669, 1368, 3356, 310...","[-2.77645, -0.534929, 1.3827, -1.22483, -1.203..."


In [143]:
# Drop rows whose first-column value is not in the PPI protein list;
# common_protein prints the shape that remains.
df = common_protein(df)
df.head(1)

(3153, 4)


Unnamed: 0,accessions,labels,ngrams,embeddings
20966,P62258,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...","[4043, 855, 1084, 5663, 1250, 998, 3960, 7194,...","[-0.707461, -0.440626, 0.28608, -1.23422, 0.47..."


In [None]:
# Build all protein pairs with their 'works' value and save them.
# NOTE(review): to_csv writes the row index as an extra unnamed column by
# default — confirm downstream readers expect it.
df = get_pairs(df)
df.to_csv('./data/workers_mf.csv')
print(df.head(5))

--- 22.91 seconds ---
