In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cafa-6-protein-function-prediction/sample_submission.tsv
/kaggle/input/cafa-6-protein-function-prediction/IA.tsv
/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta
/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset-taxon-list.tsv
/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv
/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta
/kaggle/input/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv
/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo


Cell 1

In [2]:
BASE_DIR = "/kaggle/input/cafa-6-protein-function-prediction"

TRAIN_DIR = f"{BASE_DIR}/Train"
TEST_DIR = f"{BASE_DIR}/Test"

# Train files
TRAIN_FASTA = f"{TRAIN_DIR}/train_sequences.fasta"
TRAIN_TERMS = f"{TRAIN_DIR}/train_terms.tsv"
TRAIN_TAXON = f"{TRAIN_DIR}/train_taxonomy.tsv"
GO_OBO = f"{TRAIN_DIR}/go-basic.obo"
IA_FILE = f"{BASE_DIR}/IA.tsv"
SAMPLE_SUB = f"{BASE_DIR}/sample_submission.tsv"

# Test files
TEST_SUPERSET_FASTA  = f"{TEST_DIR}/testsuperset.fasta"
TEST_TAXON_LIST = f"{TEST_DIR}/testsuperset-taxon-list.tsv"

import os

for p in [TRAIN_FASTA, TRAIN_TERMS, GO_OBO, IA_FILE, TEST_SUPERSET_FASTA, TEST_TAXON_LIST]:
    print(p, "->", os.path.exists(p))



/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta -> True
/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv -> True
/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo -> True
/kaggle/input/cafa-6-protein-function-prediction/IA.tsv -> True
/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta -> True
/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset-taxon-list.tsv -> True


In [3]:
!pip install --quiet fair-esm
!pip install biopython


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting biopython
  Downloading biopython-1.86-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [4]:
import esm
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

esm_model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
esm_model = esm_model.to(device).eval()

batch_converter = alphabet.get_batch_converter()

print("ESM Loaded ✅")


Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t6_8M_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t6_8M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t6_8M_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t6_8M_UR50D-contact-regression.pt


ESM Loaded ✅


In [5]:
from Bio import SeqIO


def clean_id(raw):
    parts = raw.split("|")
    if len(parts) >= 2:
        return parts[1]
    return parts[0].split()[0]

train_sequences = {}
for rec in SeqIO.parse(TRAIN_FASTA, "fasta"):
    pid = clean_id(rec.id)
    train_sequences[pid] = str(rec.seq)

test_sequences = {}
for rec in SeqIO.parse(TEST_SUPERSET_FASTA, "fasta"):
    pid = clean_id(rec.id)
    test_sequences[pid] = str(rec.seq)

print("Train proteins:", len(train_sequences))
print("Test proteins:", len(test_sequences))


Train proteins: 82404
Test proteins: 224309


In [6]:

df_terms = pd.read_csv(TRAIN_TERMS, sep="\t")

go_terms = sorted(df_terms["term"].unique())
go2idx = {go:i for i,go in enumerate(go_terms)}
idx2go = {i:go for go,i in go2idx.items()}

num_classes = len(go2idx)
print("Num GO terms:", num_classes)


Num GO terms: 26125


In [7]:
labels = {pid: [] for pid in train_sequences}

for row in df_terms.itertuples():
    pid = row.EntryID
    go  = row.term
    
    if pid in labels and go in go2idx:
        labels[pid].append(go2idx[go])


In [8]:
MAX_LEN = 800

@torch.no_grad()
def embed_sequence(seq):
    seq = seq[:MAX_LEN]
    batch = [("protein", seq)]
    _, _, toks = batch_converter(batch)

    toks = toks.to(device)

    out = esm_model(toks, repr_layers=[6])
    reps = out["representations"][6][0][1:-1]
    emb = reps.mean(0)

    return emb.cpu()



In [9]:
from tqdm import tqdm

train_embeddings = {}

for pid, seq in tqdm(train_sequences.items()):
    train_embeddings[pid] = embed_sequence(seq)

torch.save(train_embeddings, "/kaggle/working/train_emb.pt")
print("Train embeddings saved ✅")


100%|██████████| 82404/82404 [18:30<00:00, 74.22it/s]


Train embeddings saved ✅


In [10]:
from torch.utils.data import Dataset, DataLoader

class TrainDataset(Dataset):
    def __init__(self, emb, labels, n_classes):
        self.ids = list(emb.keys())
        self.emb = emb
        self.labels = labels
        self.n_classes = n_classes

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        pid = self.ids[idx]
        x = self.emb[pid]

        y = torch.zeros(self.n_classes)
        for t in self.labels[pid]:
            y[t] = 1.0

        return x, y

ds = TrainDataset(train_embeddings, labels, num_classes)
loader = DataLoader(ds, batch_size=32, shuffle=True)


In [11]:
import torch.nn as nn

class GOModel(nn.Module):
    def __init__(self, dim, n_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, 512),
            nn.ReLU(),
            nn.Linear(512, n_classes)
        )

    def forward(self, x):
        return self.net(x)


In [12]:
embedding_dim = next(iter(train_embeddings.values())).shape[0]

model = GOModel(embedding_dim, num_classes).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

EPOCHS = 5

for epoch in range(EPOCHS):
    total = 0
    for x, y in loader:
        x = x.to(device)
        y = y.to(device)

        opt.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        opt.step()

        total += loss.item()

    print(f"Epoch {epoch+1} | loss = {total/len(loader):.4f}")


Epoch 1 | loss = 0.0044
Epoch 2 | loss = 0.0016
Epoch 3 | loss = 0.0015
Epoch 4 | loss = 0.0014
Epoch 5 | loss = 0.0013


In [13]:
test_embeddings = {}

for pid, seq in tqdm(test_sequences.items()):
    test_embeddings[pid] = embed_sequence(seq)

torch.save(test_embeddings, "/kaggle/working/test_emb.pt")


100%|██████████| 224309/224309 [43:08<00:00, 86.67it/s] 


In [14]:
OUTPUT = "/kaggle/working/submission.tsv"

model.eval()

with open(OUTPUT, "w") as f:
    for pid in tqdm(test_sequences.keys()):

        emb = test_embeddings[pid].to(device)

        with torch.no_grad():
            logits = model(emb)
            probs = torch.sigmoid(logits).cpu().numpy()

        for i, p in enumerate(probs):
            if p > 0.01:
                go = idx2go[i]
                f.write(f"{pid}\t{go}\t{p:.3f}\n")

print("Submission saved ✅:", OUTPUT)


100%|██████████| 224309/224309 [2:38:34<00:00, 23.57it/s]

Submission saved ✅: /kaggle/working/submission.tsv



