In [1]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

/kaggle/input/cafa5-additional-data/train_terms.tsv
/kaggle/input/cafa5-additional-data/train_sequences.fasta
/kaggle/input/final-uniprot/uni_term.tsv
/kaggle/input/final-uniprot/uni_seq.tsv
/kaggle/input/protein-go-annotations/goa_uniprot_all.csv
/kaggle/input/cafa-6-protein-function-prediction/sample_submission.tsv
/kaggle/input/cafa-6-protein-function-prediction/IA.tsv
/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta
/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset-taxon-list.tsv
/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv
/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta
/kaggle/input/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv
/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo


In [2]:
import kagglehub
# Ask kagglehub for the GO-annotation dataset and report where its files are.
path = kagglehub.dataset_download(
    "seddiktrk/protein-go-annotations"
)
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/protein-go-annotations


In [3]:
!pip install biopython --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
from pathlib import Path
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import defaultdict
import subprocess
from subprocess import Popen, PIPE

In [5]:
def create_df_tsv(tsv_file):
    """Load a tab-separated file into a DataFrame.

    Parameters
    ----------
    tsv_file : str or path-like
        Location of the tab-separated file to read.

    Returns
    -------
    pandas.DataFrame
        The table as parsed by ``pandas.read_csv`` with ``sep="\\t"`` and
        otherwise default options.
    """
    return pd.read_csv(tsv_file, sep="\t")

In [6]:
def create_df(data_root, seq_file, has_metadata):
    """Read a FASTA file into a two-column DataFrame (``EntryID``, ``Seq``).

    Parameters
    ----------
    data_root : str or path-like
        Directory that ``seq_file`` is resolved against.
    seq_file : str
        FASTA file path relative to ``data_root``.
    has_metadata : bool
        When true, the record id is treated as pipe-delimited (for example
        ``sp|P12345|NAME``) and its second field becomes the ``EntryID``.
        An id that contains no ``|`` is kept unchanged rather than raising
        ``IndexError``. When false, the record id is always used unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per FASTA record, in file order, with the sequence stored
        as a plain string.
    """
    fasta_path = Path(data_root) / seq_file

    ids = []
    seqs = []
    with open(fasta_path) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            entry_id = record.id
            if has_metadata:
                fields = entry_id.split('|')
                # Only take the second field when the id really is
                # pipe-delimited; otherwise fall back to the whole id.
                if len(fields) > 1:
                    entry_id = fields[1]
            ids.append(entry_id)
            seqs.append(str(record.seq))

    return pd.DataFrame({'EntryID': ids, 'Seq': seqs})

In [7]:
# CAFA 6 training sequences: has_metadata=True, so EntryID is taken from the
# pipe-delimited record id.
train_cafa6_df = create_df(
    "/kaggle/input/cafa-6-protein-function-prediction",
    "Train/train_sequences.fasta",
    has_metadata=True,
)

# CAFA 5 training sequences: record ids are used unchanged.
train_cafa5_df = create_df(
    "/kaggle/input/cafa5-additional-data",
    "train_sequences.fasta",
    has_metadata=False,
)

# UniProt sequences are stored as a TSV table rather than FASTA.
train_uniprot_df = create_df_tsv("/kaggle/input/final-uniprot/uni_seq.tsv")

# Test sequences: record ids are used unchanged.
test_df = create_df(
    "/kaggle/input/cafa-6-protein-function-prediction",
    "Test/testsuperset.fasta",
    has_metadata=False,
)

In [8]:
# Preview the first five UniProt rows. The displayed table carries an extra
# "Unnamed: 0" column next to EntryID and Seq.
train_uniprot_df.head()

Unnamed: 0,EntryID,Seq
0,A1CHB5,MLRSSITQSRQLLLSPARSRTASQWLPRAGASNRISGQRFFADVKP...
1,Q9ZFU7,MKKTRTANLHHLYHEALPEDVKLTPRVEVDNVHQRRTTDVYEHALT...
2,A0A2I6PJ05,MVPNANSNTVSLQSPNAIPPRTSSTGYITPFPPAKSVLRPVPESDW...
3,Q6CKI3,MFRLVQQQTLKSRVPNQFVSASRNSLNSQFRFNSAVALERNPQQDP...
4,O25943,MRDFNNAQITRLKVRQNAVFEKLDLEFKDGLSAISGASGVGKSVLI...


In [9]:
def create_file_from_df(df, name):
    """Write the sequences held in ``df`` to a FASTA file.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``EntryID`` and ``Seq`` columns; any other column is
        ignored.
    name : str or path-like
        Destination path of the FASTA file.

    Notes
    -----
    Records are written in row order, each with ``EntryID`` as its id and
    an empty description.
    """
    # Walk the two needed columns directly instead of DataFrame.iterrows()
    # (which builds a Series per row), and pass a generator so the records
    # are streamed to the writer rather than collected in a list first.
    records = (
        SeqRecord(Seq(sequence), id=entry_id, description="")
        for entry_id, sequence in zip(df['EntryID'], df['Seq'])
    )
    SeqIO.write(records, name, "fasta")

In [10]:
# Write each sequence table to a FASTA file (paths are relative to the
# current working directory).
for _frame, _fasta_name in (
    (train_cafa6_df, "train_cafa6.fasta"),
    (train_cafa5_df, "train_cafa5.fasta"),
    (train_uniprot_df, "train_uniprot.fasta"),
    (test_df, "test.fasta"),
):
    create_file_from_df(_frame, _fasta_name)

In [11]:
def fix_duplicate(fasta_file, fasta_unique):
    """Write a copy of ``fasta_file`` with exact duplicate records removed.

    Two records count as duplicates when both their id and their sequence
    are equal. The first occurrence is kept and input order is preserved.

    Parameters
    ----------
    fasta_file : str or path-like
        FASTA file to read.
    fasta_unique : str or path-like
        Destination FASTA file. It is always written, so it exists after
        the call even when the input contained no duplicates (it used to be
        created only when at least one duplicate was dropped).

    Notes
    -----
    Prints ``no duplicates found.`` when nothing was dropped, otherwise the
    record counts before and after de-duplication.
    """
    seen = set()
    unique_records = []
    total = 0
    for record in SeqIO.parse(fasta_file, "fasta"):
        total += 1
        # De-duplicate on the (id, sequence) pair.
        key = (record.id, str(record.seq))
        if key not in seen:
            seen.add(key)
            unique_records.append(record)

    SeqIO.write(unique_records, fasta_unique, "fasta")

    if total == len(unique_records):
        print("no duplicates found.")
    else:
        print(f"{total} -> {len(unique_records)} records unique")

In [12]:
# Check each training FASTA for records repeated with the same id and sequence.
for _source_fasta, _unique_fasta in (
    ("/kaggle/working/train_cafa5.fasta", "train_cafa5_unique.fasta"),
    ("/kaggle/working/train_cafa6.fasta", "train_cafa6_unique.fasta"),
    ("/kaggle/working/train_uniprot.fasta", "train_uniprot_unique.fasta"),
):
    fix_duplicate(_source_fasta, _unique_fasta)

no duplicates found.
no duplicates found.
no duplicates found.


In [13]:
!wget http://github.com/bbuchfink/diamond/releases/download/v2.1.6/diamond-linux64.tar.gz
!tar xzf diamond-linux64.tar.gz
!rm diamond-linux64.tar.gz

URL transformed to HTTPS due to an HSTS policy
--2025-12-09 08:24:22--  https://github.com/bbuchfink/diamond/releases/download/v2.1.6/diamond-linux64.tar.gz
Resolving github.com (github.com)... 4.237.22.38
Connecting to github.com (github.com)|4.237.22.38|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://release-assets.githubusercontent.com/github-production-release-asset/31987083/6b15e096-093f-4e8e-b9ff-5e7bb45db282?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-12-09T09%3A21%3A21Z&rscd=attachment%3B+filename%3Ddiamond-linux64.tar.gz&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-12-09T08%3A20%3A40Z&ske=2025-12-09T09%3A21%3A21Z&sks=b&skv=2018-11-09&sig=hpKAcDX%2BDX83UUkmo1oEdNeZG7Dz2LmEBbeVnn582jY%3D&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc2NTI3MDQ2MiwibmJ

In [14]:
def create_database(db_name, data_path):
    """Build a DIAMOND database from a FASTA file.

    Runs ``./diamond makedb --in <data_path> -d <db_name>`` and waits for it
    to finish.

    Parameters
    ----------
    db_name : str
        Value passed to DIAMOND's ``-d`` option.
    data_path : str
        FASTA file passed to DIAMOND's ``--in`` option.

    Raises
    ------
    subprocess.CalledProcessError
        If DIAMOND exits with a non-zero status. The exit status used to be
        ignored, so a failed build went unnoticed here.
    FileNotFoundError
        If ``./diamond`` does not exist in the current directory.
    """
    command = ['./diamond', 'makedb', '--in', data_path, '-d', db_name]
    # stdin is empty and stdout is discarded; stderr is left attached to the
    # notebook, exactly as before.
    subprocess.run(
        command,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        check=True,
    )

In [15]:
def matches(db_name, query_path, outfile_name, k):
    """Search query sequences against a DIAMOND database with ``blastp``.

    Runs ``./diamond blastp -d <db_name> -q <query_path> -o <outfile_name>
    --max-target-seqs <k> --quiet`` and waits for it to finish.

    Parameters
    ----------
    db_name : str
        Database passed to ``-d``.
    query_path : str
        Query FASTA file passed to ``-q``.
    outfile_name : str
        File DIAMOND writes the hits to (``-o``). No output format is
        requested here, so DIAMOND's default applies.
        NOTE(review): an earlier comment said this "has to be .fasta file";
        compute_knn in this notebook reads the result as a 12-column
        tab-separated table, so the content is presumably tabular rather
        than FASTA - confirm whether the extension matters at all.
    k : int
        Passed to ``--max-target-seqs`` after conversion to ``str``.

    Raises
    ------
    subprocess.CalledProcessError
        If DIAMOND exits with a non-zero status. The exit status used to be
        ignored, so a failed search went unnoticed here.
    FileNotFoundError
        If ``./diamond`` does not exist in the current directory.
    """
    command = [
        './diamond', 'blastp',
        '-d', db_name,
        '-q', query_path,
        '-o', outfile_name,
        '--max-target-seqs', str(k),
        '--quiet',
    ]
    # stdin is empty and stdout is discarded; stderr is left attached to the
    # notebook, exactly as before.
    subprocess.run(
        command,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        check=True,
    )

In [16]:
# Build one DIAMOND database per training source.
for _db_name, _training_fasta in (
    ("train_cafa5_db", "/kaggle/working/train_cafa5.fasta"),
    ("train_cafa6_db", "/kaggle/working/train_cafa6.fasta"),
    ("train_uniprot_db", "/kaggle/working/train_uniprot.fasta"),
):
    create_database(_db_name, _training_fasta)

diamond v2.1.6.160 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org
Please cite: http://dx.doi.org/10.1038/s41592-021-01101-x Nature Methods (2021)

#CPU threads: 4
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /kaggle/working/train_cafa5.fasta
Opening the database file...  [0.003s]
Loading sequences...  [0.419s]
Masking sequences...  [2.284s]
Writing sequences...  [0.081s]
Hashing sequences...  [0.025s]
Loading sequences...  [0s]
Writing trailer...  [0.001s]
Closing the input file...  [0s]
Closing the database file...  [0.002s]

Database sequences  142246
  Database letters  78752603
     Database hash  63a90e7790c6476a9be89aaadd1b422d
        Total time  2.817000s
diamond v2.1.6.160 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org
Please cite: http://dx.doi.org/10.1038/s4159

In [17]:
# Search the test sequences against every database, passing 16 as the
# --max-target-seqs value.
for _db_name, _hits_file in (
    ("train_cafa5_db", "cafa5_matches.fasta"),
    ("train_cafa6_db", "cafa6_matches.fasta"),
    ("train_uniprot_db", "uniprot_matches.fasta"),
):
    matches(_db_name, "/kaggle/working/test.fasta", _hits_file, 16)

In [18]:
def create_go_map(terms_file, csv_file):
    """Group the ``term`` column of an annotation table by ``EntryID``.

    Parameters
    ----------
    terms_file : str or path-like
        Delimited text file with at least ``EntryID`` and ``term`` columns.
    csv_file : bool
        Truthy when the file is comma-separated; otherwise it is read as
        tab-separated.

    Returns
    -------
    collections.defaultdict(list)
        Maps each ``EntryID`` to its terms, in the order the rows appear
        (repeated rows are kept).
    """
    delimiter = "," if csv_file else "\t"
    annotations = pd.read_csv(terms_file, sep=delimiter)

    terms_by_entry = defaultdict(list)
    for entry_id, go_term in zip(annotations['EntryID'], annotations['term']):
        terms_by_entry[entry_id].append(go_term)
    return terms_by_entry

In [19]:
# EntryID -> GO terms for each training source (all three files are TSV).
go_map_cafa6 = create_go_map(
    "/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv",
    csv_file=False,
)
go_map_cafa5 = create_go_map(
    "/kaggle/input/cafa5-additional-data/train_terms.tsv",
    csv_file=False,
)
go_map_uniprot = create_go_map(
    "/kaggle/input/final-uniprot/uni_term.tsv",
    csv_file=False,
)

In [20]:
def parse_go_obo(obo_path):
    """Collect the direct ``is_a`` parents of GO terms from an OBO file.

    Parameters
    ----------
    obo_path : str or path-like
        OBO file to read (decoded as UTF-8).

    Returns
    -------
    collections.defaultdict(set)
        Maps a GO id to the set of GO ids named on its ``is_a`` lines.
        A term without any ``is_a: GO:`` line gets no entry. Only ``is_a``
        is followed; other relationship tags are not read.
    """
    go_parents = defaultdict(set)
    current_id = None

    with open(obo_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line.startswith("["):
                # A new stanza begins: forget the previous term so that
                # is_a lines of a stanza without a GO id are not credited
                # to the term that happened to precede it.
                current_id = None
            elif line.startswith("id: GO:"):
                current_id = line.split()[1]
            elif line.startswith("is_a: GO:") and current_id:
                # The parent id is the first token after the tag; whatever
                # follows it (e.g. the "! name" comment) is ignored.
                go_parents[current_id].add(line.split()[1])
    return go_parents

In [21]:
# Direct is_a parents of every GO term in the competition's ontology file.
go_parents = parse_go_obo("/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo")

In [22]:
#blast knn
from collections import defaultdict
import pandas as pd

def compute_knn(hits_file, go_map, evalue_thresh=0.001):
    """Score GO terms for each query by a bitscore-weighted vote of its hits.

    For every query, only hits whose e-value is below ``evalue_thresh`` are
    kept. Each GO term annotated on a kept subject receives that hit's
    bitscore; the per-term totals are then divided by the summed bitscore of
    all kept hits of the query.

    Parameters
    ----------
    hits_file : str or path-like
        Header-less, tab-separated hit table whose 12 columns are read as
        query_id, subject_id, pid, aln_len, mismatch, gap, qstart, qend,
        sstart, send, evalue, bitscore.
    go_map : mapping
        Subject id -> iterable of GO terms (looked up with ``.get``).
    evalue_thresh : float, default 0.001
        Exclusive upper bound on the e-value of a hit.

    Returns
    -------
    collections.defaultdict(dict)
        Query id -> {GO term: score}. A query with no hit below the
        threshold is absent; a query whose kept subjects carry no
        annotation maps to an empty dict.
    """
    # Read the hits table.
    hits_df = pd.read_csv(hits_file, sep="\t", header=None,
                          names=['query_id', 'subject_id', 'pid', 'aln_len', 'mismatch', 'gap',
                                 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'])

    # Apply the e-value filter once for the whole table; queries left
    # without any hit simply produce no group below.
    hits_df = hits_df[hits_df['evalue'] < evalue_thresh]

    results = defaultdict(dict)

    for q, group in hits_df.groupby('query_id'):
        bitscore_sum = group['bitscore'].sum()
        go_score = defaultdict(float)

        # Iterate the two needed columns directly; DataFrame.iterrows()
        # would build a Series for every hit.
        for subject, bits in zip(group['subject_id'], group['bitscore']):
            for go in go_map.get(subject, []):
                go_score[go] += bits

        # Normalise by the total bitscore of the query's kept hits.
        results[q] = {go: total / bitscore_sum for go, total in go_score.items()}

    return results


In [23]:
# Score GO terms per test protein, separately for each training source.
results_cafa_6 = compute_knn(
    hits_file="/kaggle/working/cafa6_matches.fasta",
    go_map=go_map_cafa6,
)
results_cafa_5 = compute_knn(
    hits_file="/kaggle/working/cafa5_matches.fasta",
    go_map=go_map_cafa5,
)
results_uniprot = compute_knn(
    hits_file="/kaggle/working/uniprot_matches.fasta",
    go_map=go_map_uniprot,
)

In [24]:
def merge_many_results(*results_dicts):
    """Combine several ``{query: {GO term: score}}`` mappings.

    Parameters
    ----------
    *results_dicts : mapping
        Any number of per-query score mappings.

    Returns
    -------
    collections.defaultdict(dict)
        For every query and GO term, the largest score seen in any input.
        A query only appears once at least one of its terms has been
        merged, so queries whose mappings are all empty are left out.
    """
    merged = defaultdict(dict)

    for partial in results_dicts:
        for query, term_scores in partial.items():
            for term, value in term_scores.items():
                bucket = merged[query]
                bucket[term] = max(bucket[term], value) if term in bucket else value

    return merged

In [25]:
# Combine the three sources; a term scored by several sources keeps its
# highest score.
final = merge_many_results(
    results_cafa_6,
    results_cafa_5,
    results_uniprot,
)

In [26]:
def propagate_go_hierarchy(results, go_parents, decay=0.5, max_depth=1):
    """Push term scores up the GO hierarchy with a per-level decay.

    A term's score multiplied by ``decay`` is offered to each of its
    parents; a parent keeps the larger of its current score and the offered
    one. The inputs are not modified.

    Parameters
    ----------
    results : mapping
        Query id -> {GO term: score}.
    go_parents : mapping
        GO term -> iterable of its direct parents (looked up with ``.get``).
    decay : float, default 0.5
        Factor applied for every level climbed. Values in ``[0, 1]`` are
        assumed.
    max_depth : int or None, default 1
        Number of levels to climb. The default of ``1`` reaches direct
        parents only, which is the original behaviour. ``None`` keeps
        climbing until no ancestor's score improves, i.e. a term ``n``
        levels up is offered ``score * decay ** n``.

    Returns
    -------
    collections.defaultdict(dict)
        Query id -> {GO term: score} containing the original terms plus the
        ancestors that were reached.
    """
    new_results = defaultdict(dict)

    for q, go_dict in results.items():
        scores = dict(go_dict)
        # Terms whose score was set at the previous level and therefore
        # still has to be offered to their own parents.
        frontier = go_dict
        depth = 0

        while frontier and (max_depth is None or depth < max_depth):
            next_frontier = {}
            for go, score in frontier.items():
                parent_score = score * decay
                for parent in go_parents.get(go, ()):
                    # Climb further only through parents whose score
                    # actually improved; anything else is already covered.
                    if parent not in scores or parent_score > scores[parent]:
                        scores[parent] = parent_score
                        next_frontier[parent] = parent_score
            frontier = next_frontier
            depth += 1

        new_results[q] = scores

    return new_results

In [27]:
# NOTE(review): `final` is rebound to its own propagated version, so running
# this cell a second time propagates the already-propagated scores again.
final = propagate_go_hierarchy(final, go_parents, decay=0.5)

# Write one line per (protein, GO term) whose score is at least 0.01,
# following the order of the test proteins.
# NOTE(review): a header row is written first - confirm against
# sample_submission.tsv that the expected format includes one.
with open("/kaggle/working/submission.tsv", "w") as f:
    f.write("query_id\tGO_term\tscore\n")

    for q in test_df["EntryID"]:
        term_scores = final.get(q)
        if term_scores is None:
            # No prediction at all for this protein.
            continue

        f.writelines(
            f"{q}\t{go}\t{score:.6f}\n"
            for go, score in term_scores.items()
            if score >= 0.01
        )