# Combining datasets

### Here we work towards a method that combines datasets and outputs predictions

#### We will use: SPROF Predictions, ProFun (https://github.com/SamusRam/ProFun) Predictions and QuickGO Annotations

In [1]:
# Create a dictionary that maps each GO term to its root ontology (CCO, MFO or BPO)

import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import joblib
import pickle
import statistics
from tqdm import tqdm
from Bio import SeqIO
import gc

def extract_go_terms_and_branches(file_path):
    """Map every GO term ID found in an OBO file to its root-ontology code.

    Only ``[Term]`` stanzas are considered. A term is included when the stanza
    has both an ``id: GO:<digits>`` line and a ``namespace:`` line whose value
    is one of ``biological_process``, ``cellular_component`` or
    ``molecular_function``. Stanzas with any other namespace are skipped
    instead of raising ``KeyError``.

    Args:
        file_path: Path to the OBO file to parse.

    Returns:
        dict mapping GO term IDs (e.g. ``'GO:0005575'``) to ``'BPO'``,
        ``'CCO'`` or ``'MFO'``.
    """
    # Built once: the namespace -> abbreviation mapping is the same for every stanza.
    branch_abbr = {'biological_process': 'BPO', 'cellular_component': 'CCO', 'molecular_function': 'MFO'}

    # Decode explicitly as UTF-8 rather than relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Each match starts at "[Term]" and runs up to (not including) the newline
    # that precedes the next "[...]" stanza header, or to the end of the text.
    stanzas = re.findall(r'\[Term\][\s\S]*?(?=\n\[|$)', content)

    go_terms_dict = {}
    for stanza in stanzas:
        # Both tags are matched at the start of a line inside the stanza.
        go_id = re.search(r'^id: (GO:\d+)', stanza, re.MULTILINE)
        namespace = re.search(r'^namespace: (\w+)', stanza, re.MULTILINE)
        if not (go_id and namespace):
            continue

        branch = branch_abbr.get(namespace.group(1))
        if branch is None:
            # Namespace outside the three root ontologies: nothing to map it to.
            continue

        go_terms_dict[go_id.group(1)] = branch

    return go_terms_dict

# Build the GO term -> root ontology (BPO/CCO/MFO) lookup from the go-basic.obo
# file in the CAFA 5 input dataset. `go_terms_dict` is used by the loading cells
# below to find the branch of every predicted GO term.
file_path = '/kaggle/input/cafa-5-protein-function-prediction/Train/go-basic.obo'
go_terms_dict = extract_go_terms_and_branches(file_path)

In [2]:
class ProteinPredictions:
    """Accumulate GO-term scores per protein and export them as a TSV file.

    Scores submitted several times for the same (protein, branch, GO term)
    are combined into their arithmetic mean, maintained incrementally.
    """

    def __init__(self):
        # protein -> branch -> GO term -> mean of all scores added so far
        self.predictions = {}
        # protein -> branch -> GO term -> number of scores folded into that mean
        self.predictions_n = {}

    def add_prediction(self, protein, go_term, score, branch):
        """Fold one score into the running mean for a protein / GO term.

        Args:
            protein: Identifier of the protein.
            go_term: GO term being predicted.
            score: Confidence score; converted with ``float()``.
            branch: Ontology branch key, one of 'CCO', 'MFO' or 'BPO'.
        """
        # First time this protein is seen: create empty per-branch tables.
        if protein not in self.predictions:
            self.predictions[protein] = {'CCO': {}, 'MFO': {}, 'BPO': {}}
            self.predictions_n[protein] = {'CCO': {}, 'MFO': {}, 'BPO': {}}

        value = float(score)
        means = self.predictions[protein][branch]
        counts = self.predictions_n[protein][branch]

        # New GO term for this protein/branch: the mean is just the score itself.
        if go_term not in means:
            counts[go_term] = 1
            means[go_term] = value
            return

        # Incremental mean over seen + 1 samples:
        #   new_mean = value / (seen + 1) + old_mean * seen / (seen + 1)
        seen = counts[go_term]
        means[go_term] = value / (seen + 1) + means[go_term] * seen / (seen + 1)
        counts[go_term] += 1

    def get_predictions(self, output_file='submission.tsv', top=90):
        """Write the best-scoring GO terms of every protein to a TSV file.

        For each protein and each branch, GO terms are ordered by descending
        mean score and at most ``top`` of them are written, one per line, as
        ``protein<TAB>go_term<TAB>score`` with the score formatted to three
        decimals.

        Args:
            output_file: Path of the file to (over)write.
            top: Maximum number of GO terms kept per protein and branch.
        """
        with open(output_file, 'w') as out:
            for protein, branches in self.predictions.items():
                for go_terms in branches.values():
                    # Highest mean score first, then truncate to the requested count.
                    ranked = sorted(go_terms.items(), key=lambda pair: pair[1], reverse=True)
                    for go_term, score in ranked[:top]:
                        out.write(f"{protein}\t{go_term}\t{score:.3f}\n")
   


In [3]:
# Single accumulator that every prediction file loaded below is merged into.
protein_predictions = ProteinPredictions()

In [4]:
# (GO term, protein ID) pairs filled in by the first file loaded below; the
# later loading cells skip any pair that is present in this set.
no_add =set ()

In [5]:
# Seed the ensemble with the QuickGO-based submission file.
# Each line is parsed as tab-separated: protein ID, GO term, score.
# The context manager makes sure the file handle is closed after the loop.
with open('/kaggle/input/submisions-last-try/my_quickgo_submission.tsv') as submission_file:
    for line in tqdm(submission_file):
        item_list = line.split('\t')
        temp_id = item_list[0]
        go = item_list[1]
        score = float(item_list[2].strip())
        # GO terms absent from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # Record the pair so the files merged in the following cells leave
            # the score of this (GO term, protein) pair untouched.
            no_add.add((go, temp_id))
            protein_predictions.add_prediction(temp_id, go, score, root)

1602058it [00:09, 171405.73it/s]


In [6]:
# Merge the preds_ens2_1000 submission into the ensemble.
# Each line is parsed as tab-separated: protein ID, GO term, score.
# The context manager makes sure the file handle is closed after the loop.
with open('/kaggle/input/submisions-last-try/preds_ens2_1000/submission.tsv') as submission_file:
    for line in tqdm(submission_file):
        item_list = line.split('\t')
        temp_id = item_list[0]
        go = item_list[1]
        score = float(item_list[2].strip())
        # GO terms absent from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # Pairs registered in no_add keep the score they already have.
            if (go, temp_id) not in no_add:
                protein_predictions.add_prediction(temp_id, go, score, root)

36765561it [02:52, 213607.81it/s]


In [7]:
# Merge the preds_ens2_2500 submission into the ensemble.
# Each line is parsed as tab-separated: protein ID, GO term, score.
# The context manager makes sure the file handle is closed after the loop.
with open('/kaggle/input/submisions-last-try/preds_ens2_2500/submission.tsv') as submission_file:
    for line in tqdm(submission_file):
        item_list = line.split('\t')
        temp_id = item_list[0]
        go = item_list[1]
        score = float(item_list[2].strip())
        # GO terms absent from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # Pairs registered in no_add keep the score they already have.
            if (go, temp_id) not in no_add:
                protein_predictions.add_prediction(temp_id, go, score, root)

46652926it [04:07, 188295.84it/s]


In [8]:
# Merge the submission_count_no001 file into the ensemble.
# NOTE: the file name contains a space before "tsv"; it is kept exactly as is.
# Each line is parsed as tab-separated: protein ID, GO term, score.
# The context manager makes sure the file handle is closed after the loop.
with open('/kaggle/input/submision-notruncate001/submission_count_no001. tsv') as submission_file:
    for line in tqdm(submission_file):
        item_list = line.split('\t')
        temp_id = item_list[0]
        go = item_list[1]
        score = float(item_list[2].strip())
        # GO terms absent from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # Pairs registered in no_add keep the score they already have.
            if (go, temp_id) not in no_add:
                protein_predictions.add_prediction(temp_id, go, score, root)

38303280it [04:14, 150449.73it/s]


In [9]:
# Merge the submission_importance_v2_no001 file into the ensemble.
# NOTE: the file name contains a space before "tsv"; it is kept exactly as is.
# Each line is parsed as tab-separated: protein ID, GO term, score.
# The context manager makes sure the file handle is closed after the loop.
with open("/kaggle/input/submision-notruncate001/submission_importance_v2_no001. tsv") as submission_file:
    for line in tqdm(submission_file):
        item_list = line.split('\t')
        temp_id = item_list[0]
        go = item_list[1]
        score = float(item_list[2].strip())
        # GO terms absent from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # Pairs registered in no_add keep the score they already have.
            if (go, temp_id) not in no_add:
                protein_predictions.add_prediction(temp_id, go, score, root)

38303280it [05:11, 122808.76it/s]


In [10]:
# Merge the preds_t5_nomraloss_count submission into the ensemble.
# Each line is parsed as tab-separated: protein ID, GO term, score.
# The context manager makes sure the file handle is closed after the loop.
with open("/kaggle/input/submisions-last-try/preds_t5_nomraloss_count/submission.tsv") as submission_file:
    for line in tqdm(submission_file):
        item_list = line.split('\t')
        temp_id = item_list[0]
        go = item_list[1]
        score = float(item_list[2].strip())
        # GO terms absent from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # Pairs registered in no_add keep the score they already have.
            if (go, temp_id) not in no_add:
                protein_predictions.add_prediction(temp_id, go, score, root)

40589174it [03:59, 169418.73it/s]


In [11]:
# Merge the preds_t5_normaloss_importamce submission into the ensemble.
# Each line is parsed as tab-separated: protein ID, GO term, score.
# The context manager makes sure the file handle is closed after the loop.
with open("/kaggle/input/submisions-last-try/preds_t5_normaloss_importamce/submission.tsv") as submission_file:
    for line in tqdm(submission_file):
        item_list = line.split('\t')
        temp_id = item_list[0]
        go = item_list[1]
        score = float(item_list[2].strip())
        # GO terms absent from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # Pairs registered in no_add keep the score they already have.
            if (go, temp_id) not in no_add:
                protein_predictions.add_prediction(temp_id, go, score, root)

40986374it [03:49, 178615.09it/s]


In [12]:
# Merge the submision_allLabels file into the ensemble.
# Despite the .csv extension, each line is parsed as tab-separated:
# protein ID, GO term, score.
# The context manager makes sure the file handle is closed after the loop.
with open("/kaggle/input/submisions-last-try/submision_allLabels.csv") as submission_file:
    for line in tqdm(submission_file):
        item_list = line.split('\t')
        temp_id = item_list[0]
        go = item_list[1]
        score = float(item_list[2].strip())
        # GO terms absent from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # Pairs registered in no_add keep the score they already have.
            if (go, temp_id) not in no_add:
                protein_predictions.add_prediction(temp_id, go, score, root)

52437240it [05:00, 174337.08it/s]


In [13]:
# Merge the blasp_unique submission into the ensemble.
# Each line is parsed as tab-separated: protein ID, GO term, score.
# The context manager makes sure the file handle is closed after the loop.
with open("/kaggle/input/submisions-last-try/blasp_unique/submision.tsv") as submission_file:
    for line in tqdm(submission_file):
        item_list = line.split('\t')
        temp_id = item_list[0]
        go = item_list[1]
        score = float(item_list[2].strip())
        # GO terms absent from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # Pairs registered in no_add keep the score they already have.
            if (go, temp_id) not in no_add:
                protein_predictions.add_prediction(temp_id, go, score, root)

11880399it [01:15, 158057.51it/s]


In [14]:
# Merge the SPROF predictions into the ensemble.
# Each line is parsed as tab-separated: protein ID, GO term, score.
# The context manager makes sure the file handle is closed after the loop.
with open("/kaggle/input/sprof-predictions/submission.tsv") as submission_file:
    for line in tqdm(submission_file):
        item_list = line.split('\t')
        temp_id = item_list[0]
        go = item_list[1]
        score = float(item_list[2].strip())
        # GO terms absent from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # Pairs registered in no_add keep the score they already have.
            if (go, temp_id) not in no_add:
                protein_predictions.add_prediction(temp_id, go, score, root)

4032720it [00:22, 176524.01it/s]


In [15]:
# Write the merged scores using the method defaults: output file
# 'submission.tsv', at most 90 GO terms per protein and branch.
protein_predictions.get_predictions()

In [16]:
# Preview the first 200 lines of the written submission file.
!head -n 200 'submission.tsv'

A2AIG8	GO:0005575	1.000
A2AIG8	GO:0110165	0.554
A2AIG8	GO:0005622	0.505
A2AIG8	GO:0005737	0.471
A2AIG8	GO:0043226	0.307
A2AIG8	GO:0043227	0.302
A2AIG8	GO:0043229	0.277
A2AIG8	GO:0043231	0.274
A2AIG8	GO:0005829	0.235
A2AIG8	GO:0005634	0.104
A2AIG8	GO:0005739	0.098
A2AIG8	GO:0016020	0.073
A2AIG8	GO:0031974	0.071
A2AIG8	GO:0070013	0.068
A2AIG8	GO:0043233	0.068
A2AIG8	GO:0012505	0.061
A2AIG8	GO:0005654	0.042
A2AIG8	GO:0031981	0.042
A2AIG8	GO:0005576	0.040
A2AIG8	GO:0005615	0.040
A2AIG8	GO:0032991	0.039
A2AIG8	GO:0031982	0.038
A2AIG8	GO:0071944	0.038
A2AIG8	GO:0005783	0.035
A2AIG8	GO:0031090	0.032
A2AIG8	GO:0005886	0.031
A2AIG8	GO:0043232	0.027
A2AIG8	GO:0065010	0.023
A2AIG8	GO:0043230	0.022
A2AIG8	GO:0043228	0.022
A2AIG8	GO:0005635	0.021
A2AIG8	GO:0070062	0.019
A2AIG8	GO:1903561	0.019
A2AIG8	GO:0031984	0.019
A2AIG8	GO:0005759	0.018
A2AIG8	GO:0031967	0.018
A2AIG8	GO:0031975	0.017
A2AIG8	GO:0042995	0.016
A2AIG8	GO:0042579	0.015
A2AIG8	GO:0098827	0.015
