# CAFA 5 Competition : Protein Function Prediction

In [1]:
import pandas as pd
print(pd.__version__)
import numpy as np

# UTILITARIES
import numpy as np
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import tensorflow as tf
import os

# Check if GPU is available and TensorFlow is using it
if tf.test.is_gpu_available():
    print('TensorFlow is using GPU')
else:
    print('TensorFlow is not using GPU')

1.5.3
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
TensorFlow is using GPU


In [2]:

env_="local"
PATH = "/kaggle/input/"
PATH="../data/"
if env_=="COLAB_INIT":
  from google.colab import drive
  drive.mount('/content/drive')
  ! pip install kaggle
  ! mkdir ~/.kaggle
  os.getcwd()
  ! pip install Bio


MAIN_DIR = f"{PATH}cafa-5-protein-function-prediction"

class config:
    train_sequences_path = MAIN_DIR  + "/Train/train_sequences.fasta"
    train_labels_path = MAIN_DIR + "/Train/train_terms.tsv"
    test_sequences_path = MAIN_DIR + "/Test (Targets)/testsuperset.fasta"
    
    num_labels = 1500
    n_epochs = 5
    batch_size = 128
    lr = 0.001

    env = env_

    
TRAIN = True
embeddings_source="T5"

PATH_DATAFRAMES="../data/data_train"
if env_=="kaggle":
    PATH_DATAFRAMES="/kaggle/input/dataframes-train-cafa/data_train"
BUILD_DATAFRAMES= True
if os.path.isfile( os.path.join(PATH_DATAFRAMES,f"X_train_{embeddings_source}_{config.num_labels}.npy")):
    BUILD_DATAFRAMES = False
BUILD_DATAFRAMES

True

In [3]:
if env_=="COLAB_INIT":
  ! cp kaggle.json ~/.kaggle/
  ! chmod 600 ~/.kaggle/kaggle.json

if env_=="COLAB_INIT" and not os.path.isdir("cafa-5-protein-function-prediction"):
  ! kaggle competitions download cafa-5-protein-function-prediction

  ! unzip cafa-5-protein-function-prediction.zip -d cafa-5-protein-function-prediction

  ! kaggle datasets download train-targets-top

  ! unzip train-targets-top.zip -d train-targets-top
  
  ! kaggle datasets download my-t5embeds

  ! unzip my-t5embeds.zip -d my-t5embeds
  
  ! kaggle datasets download dataframes-train-cafa

  ! unzip dataframes-train-cafa.zip -d dataframes-train-cafa

In [4]:
import pandas as pd

sub = pd.read_csv(f"{PATH}cafa-5-protein-function-prediction/sample_submission.tsv", sep= "\t", header = None)
sub.columns = ["The Protein ID", "The Gene Ontology term (GO) ID", "Predicted link probability that GO appear in Protein"]
sub.head(5)

Unnamed: 0,The Protein ID,The Gene Ontology term (GO) ID,Predicted link probability that GO appear in Protein
0,A0A0A0MRZ7,GO:0000001,0.123
1,A0A0A0MRZ7,GO:0000002,0.123
2,A0A0A0MRZ8,GO:0000001,0.123
3,A0A0A0MRZ8,GO:0000002,0.123
4,A0A0A0MRZ9,GO:0000001,0.123


In [5]:
from Bio import SeqIO
print("Loading train set ProtBERT Embeddings...")
fasta_train = list(SeqIO.parse(config.train_sequences_path, "fasta"))
print("Total Nb of Elements : ", len(list(fasta_train)))
fasta_test= list(SeqIO.parse(config.test_sequences_path, "fasta"))
print("Total Nb of Elements : ", len(list(fasta_test)))

Loading train set ProtBERT Embeddings...
Total Nb of Elements :  142246
Total Nb of Elements :  141865


In [6]:
seq_train=[]
ids=[]
for seq in tqdm(fasta_train ,total=len(fasta_train )):
    seq_train.append( str(seq.seq))
    ids.append(seq.id)


100%|██████████| 142246/142246 [00:00<00:00, 587516.46it/s]


In [7]:
len(seq_train)

142246

In [8]:
seq_test=[]
ids_test=[]
for seq in tqdm(fasta_test ,total=len(fasta_test )):
    seq_test.append( str(seq.seq))
    ids_test.append(seq.id)


100%|██████████| 141865/141865 [00:00<00:00, 527377.21it/s]


In [9]:
ids[-1],ids_test[-1]

('A0A8I6GHU0', 'A0A3G2FQK2')

In [10]:
len(ids),len(ids_test)


(142246, 141865)

In [11]:
SAVE_SEQUENCES_UNIQUE=False
if SAVE_SEQUENCES_UNIQUE:
    seq_train_unique=[]
    indices_unique=[]
    for i,element in tqdm(enumerate(seq_train),total=len(seq_train)):
        if element not in seq_train_unique:
            seq_train_unique.append(element)
            indices_unique.append(i)
    np.save("../data/proteins/indices_unique_train.npy",np.array(indices_unique))

   

In [12]:
ids=[seq.id for seq in fasta_train]

In [13]:
ids_test=[seq.id for seq in fasta_test]

In [14]:
len(set(ids_test).intersection(set(ids)))


73653

In [15]:
len(ids_test)

141865

In [16]:
indices_unique=np.load("../data/proteins/indices_unique_train.npy")

In [17]:
ids_unique=[id for i,id in enumerate(ids) if i in indices_unique]

In [18]:
len(ids),len(indices_unique)

(142246, 138924)

### BUILD STRATIFY TRAIN AND TEST DATAFRAMES

### Annalyze train sequences

In [19]:
# Directories for the different embedding vectors : 
embeds_map = {
    "T5" : "my-t5embeds",
    "ProtBERT" : "protbert-embeddings-for-cafa5",
    "EMS2" : "cafa-5-ems-2-embeddings-numpy"
}

# Length of the different embedding vectors :
embeds_dim = {
    "T5" : 1024,
    "ProtBERT" : 1024,
    "EMS2" : 1280
}

def get_dataset( datatype, embeddings_source):
        if embeddings_source in ["ProtBERT", "EMS2"]:
            embeds = np.load(f"{PATH}" + embeds_map[embeddings_source] + "/" + datatype + "_embeddings.npy")
            ids = np.load(f"{PATH}" + embeds_map[embeddings_source] + "/" + datatype + "_ids.npy")
        
        if embeddings_source == "T5":
            embeds = np.load(f"{PATH}"  + embeds_map[embeddings_source] + "/" + datatype + "_embeds.npy")
            ids = np.load(f"{PATH}"  + embeds_map[embeddings_source] + "/" + datatype + "_ids.npy")
            
        return embeds,ids


In [20]:
embeds={}
for su in ["T5", "EMS2","ProtBERT" ]:
    embeds[su]={}
    embeds_,ids=get_dataset( "train", su)
    embeds[su]["ids"]=ids 
    embeds[su]["sequences"]=embeds_


In [21]:
embeds_f={}
for su in ["T5","ProtBERT", "EMS2" ]:
    ids_=embeds[su]["ids"]
    print(ids_[:5])
    dic={id_:e for e,id_ in enumerate(ids_)}
    indices=[dic[id_] for id_ in ids_unique]
    print(indices[:5])
    print(ids_unique[:5])
    embeds_f[su]={}
    embeds_f[su]["ids"]=[embeds[su]["ids"][i] for i in indices]
    embeds_f[su]["sequences"]=[embeds[su]["sequences"][i] for i in indices]
    print(len(embeds_f[su]["sequences"]))
    #np.save("../data/proteins/ids.npy",embeds_f[su]["ids"])
    #np.save(f"../data/proteins/embeds_{su}.npy",embeds_f[su]["sequences"])


['P20536' 'O73864' 'O95231' 'A0A0B4J1F4' 'P54366']
[0, 1, 2, 3, 4]
['P20536', 'O73864', 'O95231', 'A0A0B4J1F4', 'P54366']
138924
['P20536' 'O73864' 'O95231' 'A0A0B4J1F4' 'P54366']
[0, 1, 2, 3, 4]
['P20536', 'O73864', 'O95231', 'A0A0B4J1F4', 'P54366']
138924
['Q9ZSA8' 'P25353' 'A0A2R8YCW8' 'G3V5N8' 'A0A140LFN4']
[94986, 67183, 94495, 41576, 4859]
['P20536', 'O73864', 'O95231', 'A0A0B4J1F4', 'P54366']
138924


In [22]:
labels = pd.read_csv(config.train_labels_path, sep = "\t")
NUM_LABELS = 1500
top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
labels_names = top_terms[:NUM_LABELS].index.values
labels_names 

weights=pd.read_csv(f"{PATH}cafa-5-protein-function-prediction/IA.txt",sep="\t",header=None,index_col=0).rename(columns={1:"weight"})
labels=pd.merge(labels, weights.reset_index(names="term"), on ="term")
labels["top"]=labels.term.isin(labels_names)
labels.loc[labels.EntryID== 'Q9T1W1']

Unnamed: 0,EntryID,term,aspect,weight,top
3939559,Q9T1W1,GO:0005575,CCO,0.0,True
4645494,Q9T1W1,GO:0019028,CCO,1.276654,False
4645783,Q9T1W1,GO:0044423,CCO,8.284424,False
4679946,Q9T1W1,GO:0046727,CCO,4.357552,False
4679949,Q9T1W1,GO:0098017,CCO,0.584963,False


In [23]:
NUM_LABELS = 3000

In [None]:

print("GENERATE TARGETS FOR ENTRY IDS ("+str(NUM_LABELS)+" MOST COMMON GO TERMS)")
ids = ids_unique
labels = pd.read_csv(config.train_labels_path, sep = "\t")
print(labels.shape)
weights=pd.read_csv(f"{PATH}cafa-5-protein-function-prediction/IA.txt",sep="\t",header=None,index_col=0).rename(columns={1:"weight"})
labels=pd.merge(labels, weights.reset_index(names="term"), on ="term")
labels=labels.loc[labels.weight>0]
print(labels.shape)
print(f"Config labels {NUM_LABELS}")
top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
labels_names = top_terms[:NUM_LABELS].index.values
train_labels_sub = labels[(labels.term.isin(labels_names)) & (labels.EntryID.isin(ids))]
id_labels = train_labels_sub.groupby('EntryID')['term'].apply(list).to_dict()

go_terms_map = {label: i for i, label in enumerate(labels_names)}
labels_matrix = np.empty((len(ids), len(labels_names)))

for index, id in tqdm(enumerate(ids)):
    if id in id_labels:
        id_gos_list = id_labels[id]
        temp = [go_terms_map[go] for go in labels_names if go in id_gos_list]
        labels_matrix[index, temp] = 1
    else:
         labels_matrix[index,:] = 0


    
labels_df=pd.DataFrame(labels_matrix,columns=labels_names,index=ids)

labels_df.to_pickle(f"../data/data_train/labels_{NUM_LABELS}.pkl" )
#np.save("/kaggle/working/train_targets_top"+str(config.num_labels)+".npy", np.array(labels_matrix))
print("GENERATION FINISHED!")

In [2]:
import pandas as pd

In [3]:
blasp =pd.read_csv("blasp_submision_fold.csv")

In [5]:
blasp.to_csv("blasp_submision_fold.tsv",sep="\t",index=False, header=False)

## Generate labels by importances


In [24]:

print("GENERATE TARGETS FOR ENTRY IDS ("+str(NUM_LABELS)+" MOST COMMON GO TERMS)")
ids = ids_unique
labels = pd.read_csv(config.train_labels_path, sep = "\t")
print(labels.shape)
weights_imp=pd.read_csv(f"{PATH}cafa-5-protein-function-prediction/IA.txt",sep="\t",header=None,).rename(columns={0:"term",1:"weight"})
labels = pd.merge(labels,weights_imp, on ="term")

labels_=labels[["term","aspect","weight"]].drop_duplicates()
labels_=pd.merge(labels_,labels.groupby(["term"]).size().reset_index().rename(columns={0:"count"}),on="term")
labels_["importance"]=labels_["weight"]*labels_["count"]
labels_=labels_.sort_values(by="importance",ascending=False)
display(labels_.sort_values(by="count",ascending=False))
display(labels_.sort_values(by="weight",ascending=False))
labels_.head(10)

GENERATE TARGETS FOR ENTRY IDS (3000 MOST COMMON GO TERMS)
(5363863, 3)


Unnamed: 0,term,aspect,weight,count,importance
21304,GO:0005575,CCO,0.000000,92912,0.000000
8,GO:0008150,BPO,0.000000,92210,0.000000
21309,GO:0110165,CCO,0.025471,91286,2325.149641
24242,GO:0003674,MFO,0.000000,78637,0.000000
21287,GO:0005622,CCO,0.366945,70785,25974.228022
...,...,...,...,...,...
17422,GO:0046000,BPO,0.000000,1,0.000000
20220,GO:2000874,BPO,4.643856,1,4.643856
17410,GO:0032938,BPO,0.000000,1,0.000000
9386,GO:0097736,BPO,4.643856,1,4.643856


Unnamed: 0,term,aspect,weight,count,importance
11052,GO:0044848,BPO,15.492651,1,15.492651
24110,GO:0120212,CCO,15.478122,1,15.478122
24116,GO:0031912,CCO,15.478122,1,15.478122
23410,GO:1990015,CCO,14.893159,2,29.786319
29980,GO:1904483,MFO,14.808285,1,14.808285
...,...,...,...,...,...
16754,GO:1901750,BPO,0.000000,3,0.000000
16752,GO:0072656,BPO,0.000000,5,0.000000
16747,GO:0031117,BPO,0.000000,12,0.000000
16743,GO:0060599,BPO,0.000000,2,0.000000


Unnamed: 0,term,aspect,weight,count,importance
173,GO:0050896,BPO,1.568071,31098,48763.861747
0,GO:0008152,BPO,1.598544,30448,48672.468143
82,GO:0032501,BPO,1.65527,29274,48456.364953
69,GO:0032502,BPO,1.684844,28680,48321.312613
46,GO:0065007,BPO,1.153288,41457,47811.8748
21316,GO:0016020,CCO,1.824773,25768,47020.747337
21315,GO:0071944,CCO,2.157039,20467,44148.123047
24247,GO:0003824,MFO,1.634664,25324,41396.240279
21293,GO:0032991,CCO,2.479665,16657,41303.784386
21314,GO:0031974,CCO,2.539921,15696,39866.593187


In [None]:
weights_imp=pd.read_csv(f"{PATH}cafa-5-protein-function-prediction/IA.txt",sep="\t",header=None,).rename(columns={0:"term",1:"weight"})

In [None]:
labels.groupby(["term"]).size().sort_values()

In [None]:
labels.loc[labels.term=="GO:0005575"]

In [None]:
labels_.to_csv("../data/auxiliar_data/label_importances.csv",index=False)

In [None]:
NUM_LABELS=2500

In [None]:
len(ids)

In [25]:
labels_

Unnamed: 0,term,aspect,weight,count,importance
173,GO:0050896,BPO,1.568071,31098,48763.861747
0,GO:0008152,BPO,1.598544,30448,48672.468143
82,GO:0032501,BPO,1.655270,29274,48456.364953
69,GO:0032502,BPO,1.684844,28680,48321.312613
46,GO:0065007,BPO,1.153288,41457,47811.874800
...,...,...,...,...,...
20991,GO:0043641,BPO,0.000000,13,0.000000
20995,GO:0071366,BPO,0.000000,2,0.000000
20996,GO:0019567,BPO,0.000000,2,0.000000
20998,GO:0010253,BPO,0.000000,4,0.000000


In [None]:
labels_

In [None]:
labels.loc[labels.term=="GO:0005575"]

In [None]:
labels_

In [None]:
def add_important_labels(ids,file_suffix=""):
   
    print(labels.shape)
    print(f"Config labels {NUM_LABELS}")
    labels_names = labels_.iloc[:NUM_LABELS].term.values
    train_labels_sub = labels[(labels.term.isin(labels_names)) & (labels.EntryID.isin(ids))]
    id_labels = train_labels_sub.groupby('EntryID')['term'].apply(list).to_dict()

    go_terms_map = {label: i for i, label in enumerate(labels_names)}
    labels_matrix = np.empty((len(ids), len(labels_names)))

    for index, id in tqdm(enumerate(ids)):
        if id in id_labels:
            id_gos_list = id_labels[id]
            temp = [go_terms_map[go] for go in labels_names if go in id_gos_list]
            labels_matrix[index, temp] = 1
        else:
            labels_matrix[index,:] = 0


        
    labels_df=pd.DataFrame(labels_matrix,columns=labels_names,index=ids)

    labels_df.to_pickle(f"../data/proteins/labels_{NUM_LABELS}.pkl" )
    #np.save("/kaggle/working/train_targets_top"+str(config.num_labels)+".npy", np.array(labels_matrix))
    print("GENERATION FINISHED!")

In [None]:
add_important_labels(ids,file_suffix="")

In [None]:
labels___=pd.read_pickle("../data/proteins/labels_2500.pkl")

In [None]:
labels___.sum(axis=0)

In [26]:
NUM_LABELS_dict= {"BPO": 5000,
                  "CCO":2000,
                "MFO": 3000}

In [28]:
len(ids)

138924

### by aspects

In [None]:
labels_

In [29]:

RANGE_ = 1500
for aspect  in ["BPO", "CCO", "MFO"]:  
    NUM_LABELS =  NUM_LABELS_dict[aspect]
    print("GENERATE TARGETS FOR ENTRY IDS ("+str(NUM_LABELS)+" MOST COMMON GO TERMS)")
    ids = ids_unique
    labels = pd.read_csv(config.train_labels_path, sep = "\t")
    print(labels.shape)
    weights=pd.read_csv(f"{PATH}cafa-5-protein-function-prediction/IA.txt",sep="\t",header=None,index_col=0).rename(columns={1:"weight"})
    labels=pd.merge(labels, weights.reset_index(names="term"), on ="term")
    labels=labels.loc[(labels.weight>0)&(labels.aspect==aspect)]
    print(labels.shape)
    print(f"Config labels {NUM_LABELS}")
    top_terms = labels_.loc[labels_.aspect==aspect]
    labels_names = top_terms[:NUM_LABELS].term.values
    train_labels_sub = labels[(labels.term.isin(labels_names)) & (labels.EntryID.isin(ids))]
    id_labels = train_labels_sub.groupby('EntryID')['term'].apply(list).to_dict()

    go_terms_map = {label: i for i, label in enumerate(labels_names)}
    labels_matrix = np.empty((len(ids), len(labels_names)))

    for index, id in tqdm(enumerate(ids)):
        if id in id_labels:
            id_gos_list = id_labels[id]
            temp = [go_terms_map[go] for go in labels_names if go in id_gos_list]
            labels_matrix[index, temp] = 1
        else:
            labels_matrix[index,:] = 0


        
    labels_df=pd.DataFrame(labels_matrix,columns=labels_names,index=ids)

    labels_df.to_pickle(f"../data/proteins/labels_{NUM_LABELS}_{aspect}_importance.pkl" )
    #np.save("/kaggle/working/train_targets_top"+str(config.num_labels)+".npy", np.array(labels_matrix))
    print("GENERATION FINISHED!")

GENERATE TARGETS FOR ENTRY IDS (5000 MOST COMMON GO TERMS)
(5363863, 3)
(3226048, 4)
Config labels 5000


138924it [05:20, 434.02it/s]


GENERATION FINISHED!
GENERATE TARGETS FOR ENTRY IDS (2000 MOST COMMON GO TERMS)
(5363863, 3)
(1067313, 4)
Config labels 2000


138924it [00:38, 3596.09it/s]


GENERATION FINISHED!
GENERATE TARGETS FOR ENTRY IDS (3000 MOST COMMON GO TERMS)
(5363863, 3)
(567995, 4)
Config labels 3000


138924it [00:33, 4187.23it/s]


GENERATION FINISHED!


In [None]:
labels_df.shape

## Work with labels

In [None]:
labels_=labels[["term","aspect"]].drop_duplicates()
display(labels_.groupby("aspect").size()/len(labels_))
labels_selected=labels_.loc[labels_.term.isin(labels_names)]
display(labels_selected.groupby("aspect").size()/len(labels_selected))
display(labels_selected.groupby("aspect").size())

In [None]:
from sklearn.model_selection import train_test_split
labels_df=pd.read_pickle(f"data/data_train/labels_{1000}.pkl" )
ids_train,ids_test=train_test_split(labels_df.index,random_state=2,test_size=0.25)
len(ids_train),len(ids_test)

In [None]:
for N_LABELS in [1000, 1500, 2000]:
    labels_df  =pd.read_pickle(f"data/data_train/labels_{N_LABELS}.pkl" )
    labels_df.loc[ids_train].to_pickle(f"data/data_train/labels_train_{N_LABELS}.pkl" )
    labels_df.loc[ids_test].to_pickle(f"data/data_train/labels_test_{N_LABELS}.pkl" )

In [None]:
for aspect  in ["BPO", "CCO", "MFO"]:  

    NUM_LABELS =  NUM_LABELS_dict[aspect]
    print("GENERATE TARGETS FOR ENTRY IDS ("+str(NUM_LABELS)+" MOST COMMON GO TERMS)")
    ids = ids_unique
    labels = pd.read_csv(config.train_labels_path, sep = "\t")
    print(labels.shape)
    weights=pd.read_csv(f"{PATH}cafa-5-protein-function-prediction/IA.txt",sep="\t",header=None,index_col=0).rename(columns={1:"weight"})
    labels=pd.merge(labels, weights.reset_index(names="term"), on ="term")
    labels=labels.loc[(labels.weight>0)&(labels.aspect==aspect)]
    print(labels.shape)
    print(f"Config labels {NUM_LABELS}")
    top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
    labels_names = top_terms[:NUM_LABELS].index.values
    train_labels_sub = labels[(labels.term.isin(labels_names)) & (labels.EntryID.isin(ids))]
    id_labels = train_labels_sub.groupby('EntryID')['term'].apply(list).to_dict()

    go_terms_map = {label: i for i, label in enumerate(labels_names)}
    labels_matrix = np.empty((len(ids), len(labels_names)))

    for index, id in tqdm(enumerate(ids)):
        if id in id_labels:
            id_gos_list = id_labels[id]
            temp = [go_terms_map[go] for go in labels_names if go in id_gos_list]
            labels_matrix[index, temp] = 1
        else:
            labels_matrix[index,:] = 0


        
    labels_df=pd.DataFrame(labels_matrix,columns=labels_names,index=ids)

    labels_df.to_pickle(f"data/data_train/labels_{NUM_LABELS}_{aspect}.pkl" )
    #np.save("/kaggle/working/train_targets_top"+str(config.num_labels)+".npy", np.array(labels_matrix))
    print("GENERATION FINISHED!")

In [None]:
for aspect  in ["BPO", "CCO", "MFO"]:  
    NUM_LABELS =  NUM_LABELS_dict[aspect]
    for N_LABELS in [ NUM_LABELS ]:
        labels_df  =pd.read_pickle(f"data/data_train/labels_{N_LABELS}_{aspect}.pkl" )
        labels_df.loc[ids_train].to_pickle(f"data/data_train/labels_train_{N_LABELS}_{aspect}.pkl" )
        labels_df.loc[ids_test].to_pickle(f"data/data_train/labels_test_{N_LABELS}_{aspect}.pkl" )

In [None]:
for su in ["T5", "EMS2","ProtBERT" ]:
        matrix=np.load(f"data/data_train/embeds_{su}.npy")
        df_=pd.DataFrame(matrix,index=ids)
        df_train=df_.loc[ids_train]
        df_test=df_.loc[ids_test]
        print(df_train.shape,df_test.shape)
        np.save(f"data/data_train/embeds_train_{su}.npy",df_train.values)
        np.save(f"data/data_train/embeds_test_{su}.npy",df_test.values)

In [None]:
dic_ids={e:i for i,e in enumerate(ids)}
ind_train=[dic_ids[e] for e in ids_train]
ind_test=[dic_ids[e] for e in ids_test]

In [None]:
len(ind_train),len(ind_test)

In [None]:
for su1,su2 in [("T5", "EMS2"),("T5","ProtBERT")]:
      matrix1=np.load(f"data/data_train/embeds_{su1}.npy")
      matrix2=np.load(f"data/data_train/embeds_{su2}.npy")
      if matrix2.shape[1]>matrix1.shape[1]:
            matrix1=np.pad(matrix1,((0,0),(0,matrix2.shape[1]-matrix1.shape[1])))
      matrix=np.concatenate([np.expand_dims(matrix1,-1),np.expand_dims(matrix2,-1)],axis=-1)
      print(su1,su2,matrix.shape)
      np.save(f"data/data_train/embeds_{su1}-{su2}.npy",matrix)
      np.save(f"data/data_train/embeds_train_{su1}-{su2}.npy",matrix[ind_train])
      np.save(f"data/data_train/embeds_test_{su1}-{su2}.npy",matrix[ind_test])


In [None]:
matrix1=np.load(f"data/data_train/embeds_{'T5'}.npy")

In [None]:
labels.groupby("aspect").size()

In [None]:
labels

In [None]:
for aspect,df_  in labels.groupby("aspect"):
    x=df_.groupby("term")["weight"].agg(["count","sum"]).sort_values(by="count")
    selected=x.iloc[-1500:]
    display(selected.loc[selected["sum"]==0].tail(50))
    


## DATA FROM T5

### SHORT SEQUENCES T5

## CHECK TRAIN

In [None]:
embeds_t5 = pd.read_pickle("../data/t5/t5_embeddings_train_embeds.pkl")
ids_t5 = pd.read_pickle("../data/t5/t5_embeddings_train_ids.npy")

In [None]:
indices_unique

In [None]:
seq_train_unique = [(seq_train)[i] for i in  indices_unique]

In [None]:
len(seq_train_unique)

In [None]:
ids_largos = set(ids_unique).difference(set(ids_t5 ))
ids_cortos = set(ids_unique).intersection(set(ids_t5 ))
dic_indices_t5={e: i for  i,e in enumerate(ids_t5)}
dic_indices={e: i for  i,e in enumerate(ids_unique)}
indices_prev=[dic_indices[e] for e in ids_t5 if e in dic_indices]
indices_t5=[dic_indices_t5[e] for e in ids_t5 if e in dic_indices]

In [None]:
indices_largos=[dic_indices[e] for e in ids_largos]

In [None]:
len(indices_t5),len(indices_prev)

In [None]:
arr_t5  = np.stack([embeds_t5[i][1] for i in indices_t5])
arr_t5_prev  = np.stack([embeds_f['T5']['sequences'][i] for i in indices_prev])
(arr_t5-arr_t5_prev).max()

In [None]:
for i in indices_largos:
    print(len(seq_train_unique[i]))

In [None]:
ids_t5_safe=np.stack([ids_t5[e] for e in indices_t5])
embeds_t5_safe=np.stack([embeds_t5[e] for e in indices_t5])

In [None]:
len(ids_t5_safe),len(embeds_t5_safe)

In [None]:
embeds_t5_safe.shape

In [None]:
np.save("../data/data_train/ids_t5_cortos_shape4.npy",ids_t5_safe)
np.save("../data/data_train/embeds_t5_cortos_shape4.npy",embeds_t5_safe)

In [None]:
np.save("../data/auxiliar_data/indices_largos_train.npy",np.array(indices_largos))

### ADD TRAIN LABELS

In [None]:
add_important_labels(ids_t5_safe,file_suffix="_cortos_shape4")

## CHECK TEST

In [None]:
embeds_t5 = pd.read_pickle("../data/t5/t5_embeddings_test_embeds.pkl")
ids_t5 = pd.read_pickle("../data/t5/t5_embeddings_test_ids.npy")

In [None]:
embeds_prev = np.load("../data/my-t5embeds/test_embeds.npy")
ids_prev = np.load("../data/my-t5embeds/test_ids.npy")

In [None]:
len(embeds_t5 ), len(embeds_prev)

In [None]:
ids_largos = set(ids_prev).difference(set(ids_t5 ))
ids_cortos = set(ids_prev).intersection(set(ids_t5 ))
dic_indices_t5={e: i for  i,e in enumerate(ids_t5)}
dic_indices={e: i for  i,e in enumerate(ids_prev )}
indices_prev=[dic_indices[e] for e in ids_t5 if e in dic_indices]
indices_t5=[dic_indices_t5[e] for e in ids_t5 if e in dic_indices]
indices_largos=[dic_indices[e] for e in ids_largos]


In [None]:
len(indices_t5),len(indices_prev)

In [None]:
arr_t5  = np.stack([embeds_t5[i][1] for i in indices_t5])
arr_t5_prev  = np.stack([embeds_prev[i] for i in indices_prev])
(arr_t5-arr_t5_prev).max()

In [None]:
np.save("../data/auxiliar_data/indices_largos_test.npy",np.array(indices_largos))

In [None]:
for i in indices_largos:
    print(len(seq_test[i]))

## JOIN DATA

In [8]:
labels=pd.read_csv(f'../data/cafa-5-protein-function-prediction/Train/train_terms.tsv',sep="\t")

In [2]:
import pandas as pd

In [8]:
preds_t=None
for iter_ in range(1,8):
    preds=pd.read_csv(f"../results/preds_blasp10n/blasp_10n_{iter_}/preds.csv",index_col=0)
    if preds_t is None:
        preds_t=preds
    else:
        preds_t=pd.concat([preds_t,preds])


In [12]:
  preds_t.to_csv("submission.tsv",sep="\t",header=False,index=False)

In [14]:
  preds_=pd.read_csv("submission.tsv",sep="\t",header=None)

In [15]:
  preds_

Unnamed: 0,0,1,2
0,Q9CQV8,GO:0009987,0.7
1,Q9CQV8,GO:0051641,0.6
2,Q9CQV8,GO:0051179,0.7
3,Q9CQV8,GO:0033036,0.5
4,Q9CQV8,GO:0070727,0.5
...,...,...,...
19808047,A0A3G2FQK2,GO:0016049,0.4
19808048,A0A3G2FQK2,GO:0048675,0.4
19808049,A0A3G2FQK2,GO:1990138,0.4
19808050,A0A3G2FQK2,GO:0008593,0.4


In [10]:

for fold in range(0,5):
    preds=pd.read_csv(f"../results/preds_blasp10n/predsblasp_10n_fold{fold}/preds.csv",index_col=0)
    preds.to_csv(f"../results/preds_blasp10n/preds_fold{fold}.tsv",header=False,sep="\t",index=False)
    ids=preds.EntryID.unique()
    labels_=labels.loc[labels.EntryID.isin( ids)]
    labels_.to_csv(f"../results/preds_blasp10n/labels_used_fold{fold}.tsv",sep="\t",index=False)
    display(preds)

Unnamed: 0,EntryID,term,probability
0,O73864,GO:0009987,0.8
1,O73864,GO:0048869,0.6
2,O73864,GO:0048856,0.8
3,O73864,GO:0065007,0.6
4,O73864,GO:0007275,0.8
...,...,...,...
3704466,Q9SZA7,GO:0080134,0.2
3704467,Q9SZA7,GO:0080135,0.2
3704468,Q9SZA7,GO:0071944,0.1
3704469,Q9SZA7,GO:0016020,0.1


Unnamed: 0,EntryID,term,probability
0,P06731,GO:0110165,2.400000
1,P06731,GO:0071944,1.700000
2,P06731,GO:0016020,1.700000
3,P06731,GO:0005886,1.700000
4,P06731,GO:0009987,1.500000
...,...,...,...
3740803,Q5TAP6,GO:0042995,0.333333
3740804,Q5TAP6,GO:0120025,0.333333
3740805,Q5TAP6,GO:0005929,0.333333
3740806,Q5TAP6,GO:0032838,0.333333


Unnamed: 0,EntryID,term,probability
0,O43854,GO:0009987,0.1
1,O43854,GO:0048869,0.1
2,O43854,GO:0048856,0.1
3,O43854,GO:0065007,0.8
4,O43854,GO:0007275,0.1
...,...,...,...
3780566,B7ZVK1,GO:0010154,0.1
3780567,B7ZVK1,GO:0009791,0.1
3780568,B7ZVK1,GO:0048316,0.1
3780569,B7ZVK1,GO:0009733,0.1


Unnamed: 0,EntryID,term,probability
0,F4ILE1,GO:0008152,0.1
1,F4ILE1,GO:0044237,0.1
2,F4ILE1,GO:1901360,0.1
3,F4ILE1,GO:0006139,0.1
4,F4ILE1,GO:0046483,0.1
...,...,...,...
4831308,F4K6M8,GO:0019953,0.1
4831309,F4K6M8,GO:0007338,0.1
4831310,F4K6M8,GO:0000003,0.1
4831311,F4K6M8,GO:0009566,0.1


Unnamed: 0,EntryID,term,probability
0,E7F6Z8,GO:0048856,0.1
1,E7F6Z8,GO:0007275,0.1
2,E7F6Z8,GO:0048731,0.1
3,E7F6Z8,GO:0032502,0.1
4,E7F6Z8,GO:0032501,0.3
...,...,...,...
4779744,A0A8I6GHU0,GO:0030097,0.1
4779745,A0A8I6GHU0,GO:0002521,0.1
4779746,A0A8I6GHU0,GO:1903131,0.1
4779747,A0A8I6GHU0,GO:0030098,0.1


In [None]:
 preds=pd.read_csv(f"../results/preds_blasp/preds_fold{fold}.tsv",header=None,sep="\t")
 preds