In [1]:
# prepare_data.py
import pandas as pd 
from collections import defaultdict
import numpy as np
import json

# Per-term weight table. The file is read without a header row, so the two
# column names are supplied here.
ia_df = pd.read_csv(
    "/kaggle/input/cafa-6-protein-function-prediction/IA.tsv",
    sep="\t",
    header=None,
    names=["go_term", "ia_weight"],
)

# Protein -> GO term annotations (columns used below: EntryID, term, aspect).
train_terms = pd.read_csv(
    "/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv",
    sep="\t",
)

# Set USE_ASPECT to a value of the "aspect" column to train on that aspect
# only; None keeps every annotation.
USE_ASPECT = None
if USE_ASPECT is not None:
    aspect_mask = train_terms["aspect"] == USE_ASPECT
    train_terms = train_terms.loc[aspect_mask]

# Protein -> taxon id mapping; also read without a header row.
train_tax = pd.read_csv(
    "/kaggle/input/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv",
    sep="\t",
    header=None,
    names=["EntryID", "taxon_id"],
)

def read_fasta(path, header_type="uniprot"):
    """Parse a FASTA file into a ``{sequence_id: sequence}`` dictionary.

    Args:
        path: Path of the FASTA file to read.
        header_type: How to derive the id from a ``>`` header line.
            ``"uniprot"`` takes the second ``|``-separated field
            (``>sp|P12345|NAME ...`` -> ``P12345``). Any other value takes
            the first whitespace-separated token of the header.

    Returns:
        dict mapping each id to its sequence, with the record's lines
        stripped and concatenated. If an id occurs more than once, the last
        record wins.

    Raises:
        IndexError: If a header line contains nothing after ``>``.

    Notes:
        In ``"uniprot"`` mode a header without a ``|`` separator used to raise
        ``IndexError``; it now falls back to the first whitespace-separated
        token so files with mixed header styles can still be read.
    """
    seqs = {}
    with open(path) as f:
        cur_id = None
        cur_seq = []
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                # A new header closes the record collected so far.
                if cur_id is not None:
                    seqs[cur_id] = "".join(cur_seq)
                header = line[1:]
                parts = header.split("|")
                if header_type == "uniprot" and len(parts) > 1:
                    cur_id = parts[1]
                else:
                    # "simple" headers, or a uniprot-mode header with no pipes.
                    cur_id = header.split()[0]
                cur_seq = []
            else:
                cur_seq.append(line)
        # Flush the final record, which has no following header to close it.
        if cur_id is not None:
            seqs[cur_id] = "".join(cur_seq)
    return seqs

# Train headers are parsed as "uniprot" (id = second "|"-separated field);
# test headers as "simple" (id = first whitespace-separated token).
train_seqs = read_fasta("/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta", header_type="uniprot")
test_seqs = read_fasta("/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta", header_type="simple")

print(f"#train sequences: {len(train_seqs)}")
print(f"#test sequences : {len(test_seqs)}")

# One list of GO terms per protein.
prot2terms = train_terms.groupby("EntryID")["term"].apply(list)

# The sorted term list fixes the column order of the label matrix; it is
# saved below so that prediction can map column indices back to terms.
all_terms = sorted(train_terms["term"].unique())
go2idx = {go: i for i, go in enumerate(all_terms)}
print(f"#unique GO terms (labels): {len(all_terms)}")

rows = []
tax_dict = dict(zip(train_tax["EntryID"], train_tax["taxon_id"]))

# Keep only annotated proteins that also have a sequence; taxon_id is None
# when the protein is missing from the taxonomy table.
for entry_id, terms in prot2terms.items():
    if entry_id not in train_seqs:
        continue
    seq = train_seqs[entry_id]
    taxon_id = tax_dict.get(entry_id, None)
    rows.append(
        {
            "EntryID": entry_id,
            "sequence": seq,
            "taxon_id": taxon_id,
            "terms": terms
        }
    )

train_df = pd.DataFrame(rows)
print(train_df.head())
print("#final train samples:", len(train_df))

# Dense multi-hot label matrix: row i matches train_df row i, column j matches
# all_terms[j]. This is a (n_samples x n_terms) float32 array, so it is large.
num_labels = len(all_terms)
label_matrix = np.zeros((len(train_df), num_labels), dtype=np.float32)

for i, terms in enumerate(train_df["terms"]):
    for t in terms:
        j = go2idx[t]
        label_matrix[i, j] = 1.0

# Validate alignment before anything is written to disk.
assert len(train_df) == label_matrix.shape[0]
assert label_matrix.shape[1] == len(all_terms)

train_df.to_pickle("/kaggle/working/train_df.pkl")
with open("/kaggle/working/go_terms.json", "w") as f:
    json.dump(all_terms, f)

with open("/kaggle/working/go2idx.json", "w") as f:
    json.dump(go2idx, f)
# Absolute path, matching the other artefacts, the message printed at the end
# of this cell, and the path the training cell loads from. The previous
# relative path only worked when the working directory was /kaggle/working.
np.save("/kaggle/working/train_labels.npy", label_matrix)

test_rows = []
for entry_id, seq in test_seqs.items():
    test_rows.append({"EntryID": entry_id, "sequence": seq})
test_df = pd.DataFrame(test_rows)
# Absolute path for the same reason as train_labels.npy above.
test_df.to_pickle("/kaggle/working/test_df.pkl")

# Spot-check the first sample: its terms and the matching non-zero columns.
print("예시 EntryID:", train_df.iloc[0]["EntryID"])
print("예시 terms:", train_df.iloc[0]["terms"])
print("예시 label vector non-zero idx:",
      np.where(label_matrix[0] > 0)[0][:10])

print("Saved: /kaggle/working/train_df.pkl, /kaggle/working/train_labels.npy, /kaggle/working/test_df.pkl")

#train sequences: 82404
#test sequences : 224309
#unique GO terms (labels): 26125
      EntryID                                           sequence  taxon_id  \
0  A0A023FBW4  MTSHGAVKIAIFAVIALHSIFECLSKPQILQRTDHSTDSDWDPQMC...     34607   
1  A0A023FBW7  MKVLLYIAASCLMLLALNVSAENTQQEEEDYDYGTDTCPFPVLANK...     34607   
2  A0A023FDY8  MKVLLCIAASCLMLLALNVSAENTQQEEQDYDYGTDTCPFPVLANK...     34607   
3  A0A023FF81  MTSHSAVRIAIFAVIALHSIFECLSKPQILQRTDKSTDSEWDPQTC...     34607   
4  A0A023FFB5  MKASFCVIASCLVVFALKGTAEDTGTEDDFDYGNTGCPFPVLGNYK...     34607   

          terms  
0  [GO:0019958]  
1  [GO:0019957]  
2  [GO:0019957]  
3  [GO:0019958]  
4  [GO:0019957]  
#final train samples: 82404
예시 EntryID: A0A023FBW4
예시 terms: ['GO:0019958']
예시 label vector non-zero idx: [7899]
Saved: /kaggle/working/train_df.pkl, /kaggle/working/train_labels.npy, /kaggle/working/test_df.pkl


In [2]:
# dataset.py
import torch
from torch.utils.data import Dataset
import pandas as pd 
import numpy as np

# Residue vocabulary: id 0 is used for padding, ids 1..20 for the residues
# listed in AMINO_ACIDS (in that order), and id 21 for any other character.
AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"
AA2IDX = dict(zip(AMINO_ACIDS, range(1, len(AMINO_ACIDS) + 1)))
UNK_IDX = 1 + len(AMINO_ACIDS)

def seq_to_ids(seq, max_len=1024):
    """Encode a residue string as a fixed-length array of integer ids.

    Args:
        seq: Sequence of single-character residues.
        max_len: Length of the returned array. Longer inputs are truncated
            on the right; shorter ones are right-padded with 0.

    Returns:
        ``np.ndarray`` of dtype int64 and shape ``(max_len,)``. Characters
        not in ``AA2IDX`` are encoded as ``UNK_IDX``.
    """
    tokens = [AA2IDX.get(residue, UNK_IDX) for residue in seq][:max_len]
    # A non-positive repeat count yields an empty list, so nothing is padded
    # when the sequence already fills max_len.
    tokens.extend([0] * (max_len - len(tokens)))
    return np.array(tokens, dtype=np.int64)

class CafaDataset(Dataset):
    """Dataset of encoded protein sequences with optional multi-hot labels.

    Each item is ``(x, y)`` when labels were loaded, otherwise
    ``(x, entry_id)``. ``x`` is a ``torch.long`` tensor of length ``max_len``
    produced by ``seq_to_ids``; ``y`` is the ``torch.float32`` label row at
    the same position.
    """

    def __init__(self, df_path, labels_path=None, max_len=1024):
        """Load the pickled frame and, if given, the label matrix.

        Args:
            df_path: Pickled DataFrame with at least a ``"sequence"`` column,
                plus ``"EntryID"`` when no labels are supplied.
            labels_path: Optional ``.npy`` file whose row ``i`` holds the
                labels for frame row ``i``.
            max_len: Fixed length that sequences are truncated/padded to.

        Raises:
            ValueError: If the label matrix and the frame have different
                numbers of rows.
        """
        # NOTE(review): unpickling can execute arbitrary code; only load
        # pickle files produced by this notebook or another trusted source.
        self.df = pd.read_pickle(df_path)
        self.max_len = max_len
        self.labels = None
        if labels_path is not None:
            # copy=False avoids duplicating the whole matrix in memory when
            # the file is already float32; other dtypes are still converted.
            self.labels = np.load(labels_path).astype(np.float32, copy=False)
            if self.labels.shape[0] != len(self.df):
                raise ValueError(
                    f"labels have {self.labels.shape[0]} rows but the "
                    f"dataframe has {len(self.df)} rows"
                )

    def __len__(self):
        """Return the number of rows in the frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(x, y)`` with labels loaded, else ``(x, entry_id)``."""
        row = self.df.iloc[idx]
        seq = row["sequence"]
        seq_ids = seq_to_ids(seq, self.max_len)
        x = torch.tensor(seq_ids, dtype=torch.long)

        if self.labels is not None:
            y = torch.tensor(self.labels[idx], dtype=torch.float32)
            return x, y
        else:
            return x, row["EntryID"]

In [3]:
# model.py
import torch
import torch.nn as nn

class CafaCNN(nn.Module):
    """Multi-kernel 1D CNN that maps token ids to one logit per label.

    Token ids are embedded, passed through one ``Conv1d`` per kernel size,
    ReLU-activated, max-pooled over the sequence axis, concatenated, put
    through dropout and projected to ``num_labels`` logits by a linear layer.
    """

    def __init__(self, vocab_size, num_labels, emb_dim=128, num_filters=256, kernel_sizes=(3,5,7), dropout=0.5):
        super().__init__()
        # Index 0 is the padding id.
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)

        branches = [nn.Conv1d(emb_dim, num_filters, width) for width in kernel_sizes]
        self.convs = nn.ModuleList(branches)

        self.dropout = nn.Dropout(dropout)
        # Each branch contributes num_filters pooled features.
        pooled_dim = num_filters * len(kernel_sizes)
        self.fc = nn.Linear(pooled_dim, num_labels)

    def forward(self, x):
        """Return logits of shape ``(batch, num_labels)`` for ids ``x`` of shape ``(batch, seq_len)``."""
        # Conv1d expects channels first: (batch, emb_dim, seq_len).
        channels_first = self.embedding(x).transpose(1, 2)

        # Global max over the sequence axis, one pooled vector per branch.
        pooled = [
            torch.relu(branch(channels_first)).max(dim=2).values
            for branch in self.convs
        ]

        features = torch.cat(pooled, dim=1)
        return self.fc(self.dropout(features))

In [4]:
# train_baseline.py
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Train the CNN baseline on the full training set and save its weights.
# Relies on CafaDataset, CafaCNN and AMINO_ACIDS defined in earlier cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sequences are truncated/padded to 1024 tokens; labels are the multi-hot
# matrix written by the data-preparation cell.
train_ds = CafaDataset("/kaggle/working/train_df.pkl", "/kaggle/working/train_labels.npy", max_len=1024)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=0)

num_labels = train_ds.labels.shape[1]
# +2 covers the padding id (0) and the unknown-residue id (UNK_IDX).
vocab_size = len(AMINO_ACIDS) + 2

model = CafaCNN(vocab_size=vocab_size, num_labels=num_labels).to(device)
# Multi-label objective: an independent sigmoid/BCE term per GO term,
# averaged over every (sample, label) entry.
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# No validation split is used here; only the training loss is reported.
for epoch in range(3):
    model.train()
    total_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for i, (x,y) in enumerate(pbar):
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Every 100 steps, print the mean batch loss since the epoch started.
        if (i + 1) % 100 == 0:
            print(f"Epoch {epoch+1} Step {i+1} Loss {total_loss / (i+1):.4f}")
# Only the weights are saved; the prediction cell rebuilds the model with the
# same vocab_size/num_labels before loading them.
torch.save(model.state_dict(), "/kaggle/working/cafa_cnn_baseline.pt")
print("Saved model. to cafa_cnn_baseline.pt")

Epoch 1:   0%|          | 0/2576 [00:00<?, ?it/s]

Epoch 1 Step 100 Loss 0.0193
Epoch 1 Step 200 Loss 0.0108
Epoch 1 Step 300 Loss 0.0078
Epoch 1 Step 400 Loss 0.0064
Epoch 1 Step 500 Loss 0.0055
Epoch 1 Step 600 Loss 0.0049
Epoch 1 Step 700 Loss 0.0045
Epoch 1 Step 800 Loss 0.0042
Epoch 1 Step 900 Loss 0.0039
Epoch 1 Step 1000 Loss 0.0037
Epoch 1 Step 1100 Loss 0.0036
Epoch 1 Step 1200 Loss 0.0034
Epoch 1 Step 1300 Loss 0.0033
Epoch 1 Step 1400 Loss 0.0032
Epoch 1 Step 1500 Loss 0.0031
Epoch 1 Step 1600 Loss 0.0031
Epoch 1 Step 1700 Loss 0.0030
Epoch 1 Step 1800 Loss 0.0029
Epoch 1 Step 1900 Loss 0.0029
Epoch 1 Step 2000 Loss 0.0028
Epoch 1 Step 2100 Loss 0.0028
Epoch 1 Step 2200 Loss 0.0027
Epoch 1 Step 2300 Loss 0.0027
Epoch 1 Step 2400 Loss 0.0027
Epoch 1 Step 2500 Loss 0.0027


Epoch 2:   0%|          | 0/2576 [00:00<?, ?it/s]

Epoch 2 Step 100 Loss 0.0020
Epoch 2 Step 200 Loss 0.0020
Epoch 2 Step 300 Loss 0.0020
Epoch 2 Step 400 Loss 0.0020
Epoch 2 Step 500 Loss 0.0020
Epoch 2 Step 600 Loss 0.0020
Epoch 2 Step 700 Loss 0.0020
Epoch 2 Step 800 Loss 0.0019
Epoch 2 Step 900 Loss 0.0019
Epoch 2 Step 1000 Loss 0.0019
Epoch 2 Step 1100 Loss 0.0019
Epoch 2 Step 1200 Loss 0.0019
Epoch 2 Step 1300 Loss 0.0019
Epoch 2 Step 1400 Loss 0.0019
Epoch 2 Step 1500 Loss 0.0019
Epoch 2 Step 1600 Loss 0.0019
Epoch 2 Step 1700 Loss 0.0019
Epoch 2 Step 1800 Loss 0.0019
Epoch 2 Step 1900 Loss 0.0019
Epoch 2 Step 2000 Loss 0.0019
Epoch 2 Step 2100 Loss 0.0019
Epoch 2 Step 2200 Loss 0.0019
Epoch 2 Step 2300 Loss 0.0019
Epoch 2 Step 2400 Loss 0.0019
Epoch 2 Step 2500 Loss 0.0019


Epoch 3:   0%|          | 0/2576 [00:00<?, ?it/s]

Epoch 3 Step 100 Loss 0.0019
Epoch 3 Step 200 Loss 0.0019
Epoch 3 Step 300 Loss 0.0019
Epoch 3 Step 400 Loss 0.0019
Epoch 3 Step 500 Loss 0.0019
Epoch 3 Step 600 Loss 0.0019
Epoch 3 Step 700 Loss 0.0019
Epoch 3 Step 800 Loss 0.0019
Epoch 3 Step 900 Loss 0.0019
Epoch 3 Step 1000 Loss 0.0019
Epoch 3 Step 1100 Loss 0.0019
Epoch 3 Step 1200 Loss 0.0019
Epoch 3 Step 1300 Loss 0.0019
Epoch 3 Step 1400 Loss 0.0019
Epoch 3 Step 1500 Loss 0.0019
Epoch 3 Step 1600 Loss 0.0019
Epoch 3 Step 1700 Loss 0.0019
Epoch 3 Step 1800 Loss 0.0019
Epoch 3 Step 1900 Loss 0.0019
Epoch 3 Step 2000 Loss 0.0019
Epoch 3 Step 2100 Loss 0.0019
Epoch 3 Step 2200 Loss 0.0019
Epoch 3 Step 2300 Loss 0.0019
Epoch 3 Step 2400 Loss 0.0019
Epoch 3 Step 2500 Loss 0.0019
Saved model. to cafa_cnn_baseline.pt


In [5]:
# predict and make_submission
import torch
from torch.utils.data import DataLoader
import pandas as pd 
import numpy as np
import json
from tqdm.auto import tqdm

# Score every test protein with the trained baseline and write the submission.
# Relies on CafaDataset, CafaCNN and AMINO_ACIDS defined in earlier cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# go_terms[j] is the GO term for output column j (same order as in training).
with open("/kaggle/working/go_terms.json", "r") as f:
    go_terms = json.load(f)
num_labels = len(go_terms)

test_ds = CafaDataset("/kaggle/working/test_df.pkl", labels_path=None, max_len=1024)
# shuffle=False keeps predictions in the same order as the test frame.
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False, num_workers=4)

# The model must be rebuilt with the training-time sizes before the saved
# weights are loaded; +2 covers the padding id and the unknown-residue id.
vocab_size = len(AMINO_ACIDS) + 2
model = CafaCNN(vocab_size=vocab_size, num_labels=num_labels).to(device)
state_dict = torch.load("/kaggle/working/cafa_cnn_baseline.pt", map_location=device)
model.load_state_dict(state_dict)
model.eval()

rows = []

# A term is emitted when its probability is >= THRESHOLD, keeping at most
# MAX_TERMS_PER_PROT terms per protein.
# NOTE(review): in the recorded run this produced 22880 rows over 3505 batches
# of up to 64 proteins, so most proteins received no prediction at all.
# Consider a lower threshold or a per-protein top-k before submitting.
THRESHOLD = 0.5
MAX_TERMS_PER_PROT = 50

SUBMISSION_PATH = "/kaggle/working/submission.tsv"

with torch.no_grad():
    pbar = tqdm(test_loader, desc="Predicting on test")
    for batch in pbar:
        x, entry_ids = batch
        x = x.to(device)

        logits = model(x)
        probs = torch.sigmoid(logits)
        probs = probs.cpu().numpy()

        for eid, prob_vec in zip(entry_ids, probs):
            idxs = np.where(prob_vec >= THRESHOLD)[0]

            if len(idxs) > MAX_TERMS_PER_PROT:
                # Keep only the highest-scoring terms; intersect1d returns
                # the kept indices in ascending index order.
                topk_idxs = np.argsort(prob_vec)[::-1][:MAX_TERMS_PER_PROT]
                idxs = np.intersect1d(idxs, topk_idxs)

            for j in idxs:
                rows.append((eid, go_terms[j], float(prob_vec[j])))

sub_df = pd.DataFrame(rows, columns=["target","term","score"])

print("submission rows:", len(sub_df))
print(sub_df.head())

# 6. Save the submission file (tab-separated, no header row, no index).
sub_df.to_csv(SUBMISSION_PATH, sep="\t", index=False, header=False)
# Report the path that was actually written; the old message named a
# different file (submission_baseline.tsv).
print(f"Saved {SUBMISSION_PATH}")

Predicting on test:   0%|          | 0/3505 [00:00<?, ?it/s]

submission rows: 22880
   target        term     score
0  A0JNW5  GO:0005515  0.584306
1  A0JP26  GO:0005515  0.501086
2  A1A519  GO:0005515  0.642656
3  A1X283  GO:0005515  0.551929
4  A2RUB6  GO:0005515  0.541027
Saved submission_baseline.tsv
