In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
import gc
import glob
import networkx as nx

# --- INSTALL OPTIONAL DEPENDENCY ---
# obonet reads the Gene Ontology .obo file that the propagation step parses.
try:
    import obonet
except ImportError:
    # Run pip through the interpreter that backs this kernel so the package lands in
    # the environment we import from, and fail loudly if the installation fails
    # (os.system would ignore a non-zero exit status).
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'obonet', 'networkx'])
    import obonet

# --- CONFIGURATION ---
# Every tunable value of the notebook lives in this one mapping.
CFG = dict(
    # GPU when PyTorch can see one, otherwise CPU.
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    # Mini-batch size of the training and validation loaders.
    batch_size=128,
    # Upper bound on training epochs (early stopping may end sooner).
    epochs=25,
    # Initial learning rate handed to the optimizer.
    lr=1e-3,
    # Number of most frequent terms kept as prediction targets.
    top_n_terms=3000,
    # Passed to the model constructor; presumably the embedding width -- confirm
    # against the .npy files.
    input_dim=1280,
    # Root directory of the competition data.
    data_path='/kaggle/input/cafa-6-protein-function-prediction',
)

print(f">>> TRAINING 1D-CNN ON: {CFG['device']}")

In [None]:
# ==========================================
# 1. LOAD DATA (AUTO-DETECT)
# ==========================================
print("\n[1/5] LOADING DATA...")
def find_npy_file(keyword, root='/kaggle/input'):
    """Return the largest ``.npy`` file under ``root`` whose file name contains ``keyword``.

    The directory tree is searched recursively. When at least one match has
    ``'650M'`` anywhere in its path, only those matches compete; otherwise all
    matches do. Among the candidates the file with the largest size on disk wins.

    Args:
        keyword: Substring that must appear in the file name (e.g. ``'train'``).
        root: Directory to search. Defaults to the Kaggle input mount.

    Returns:
        Path of the selected file as a string.

    Raises:
        FileNotFoundError: If no file under ``root`` matches.
    """
    pattern = os.path.join(root, '**', f'*{keyword}*.npy')
    # Sorted so that equally sized candidates resolve the same way on every run
    # (glob makes no ordering promise).
    paths = sorted(glob.glob(pattern, recursive=True))
    if not paths:
        raise FileNotFoundError(f"No file matching '*{keyword}*.npy' found under {root}")
    preferred = [p for p in paths if '650M' in p]
    return max(preferred or paths, key=os.path.getsize)

# Resolve both embedding files first: only a failed lookup means "dataset not
# attached". Errors raised while reading the arrays keep their own type and
# message instead of being relabelled.
try:
    train_emb_path = find_npy_file('train')
    test_emb_path = find_npy_file('test')
except (ValueError, FileNotFoundError) as err:
    # ValueError is what max() raises on an empty match list; FileNotFoundError
    # covers a path that disappears between globbing and the size check.
    raise FileNotFoundError("Chưa Add Dataset chứa file .npy!") from err

X_train_emb = np.load(train_emb_path)
X_test_emb = np.load(test_emb_path)
print(f" -> Loaded: Train={X_train_emb.shape}, Test={X_test_emb.shape}")

# Metadata
def get_clean_ids(path):
    """Collect one identifier per FASTA header line (lines starting with ``>``).

    For a header containing ``|`` the identifier is the second ``|``-separated
    field; otherwise it is the first whitespace-delimited token after ``>``.
    Identifiers are returned in file order.
    """
    def _header_to_id(header):
        header = header.strip()
        if '|' in header:
            return header.split('|')[1]
        return header[1:].split()[0]

    with open(path, 'r') as handle:
        return [_header_to_id(row) for row in handle if row.startswith('>')]

train_ids = get_clean_ids(f"{CFG['data_path']}/Train/train_sequences.fasta")
test_ids = get_clean_ids(f"{CFG['data_path']}/Test/testsuperset.fasta")

# Embedding rows are paired with FASTA entries purely by position, so a row-count
# mismatch would silently attach labels (or test ids) to the wrong vectors.
if len(train_ids) != X_train_emb.shape[0]:
    raise ValueError(
        f"Train embeddings have {X_train_emb.shape[0]} rows but the FASTA lists {len(train_ids)} proteins"
    )
if len(test_ids) != X_test_emb.shape[0]:
    raise ValueError(
        f"Test embeddings have {X_test_emb.shape[0]} rows but the FASTA lists {len(test_ids)} proteins"
    )

train_terms = pd.read_csv(f"{CFG['data_path']}/Train/train_terms.tsv", sep='\t')
# Keep only the most frequent terms as prediction targets.
top_terms = train_terms['term'].value_counts().head(CFG['top_n_terms']).index.tolist()

# Mapping: protein id -> row position in the embedding matrix
train_id_map = {pid: i for i, pid in enumerate(train_ids)}
valid_terms = train_terms[train_terms['EntryID'].isin(train_ids) & train_terms['term'].isin(top_terms)]
# Protein x term count table, then binarised into a multi-label indicator matrix.
Y_df = valid_terms.pivot_table(index='EntryID', columns='term', aggfunc='size', fill_value=0)
Y_df = (Y_df > 0).astype(int)

# Only proteins that have both an embedding row and at least one kept term are used.
common_ids = sorted(set(train_ids) & set(Y_df.index))
if not common_ids:
    raise ValueError("No training protein carries any of the selected terms")
x_indices = [train_id_map[pid] for pid in common_ids]

X_final = X_train_emb[x_indices]
Y_final = Y_df.loc[common_ids].values
target_names = Y_df.columns.tolist()

# Free the large intermediates; only X_final / Y_final are needed from here on.
del X_train_emb, train_terms, valid_terms, Y_df
gc.collect()

X_train, X_val, y_train, y_val = train_test_split(X_final, Y_final, test_size=0.1, random_state=42)

In [None]:
# ==========================================
# 2. MODEL: 1D-CNN
# ==========================================
class ProteinDataset(Dataset):
    """In-memory dataset of feature rows with optional label rows.

    ``x`` (and ``y`` when supplied) are converted to ``torch.FloatTensor`` once at
    construction. Indexing yields ``(features, labels)`` when labels exist and the
    bare feature tensor otherwise.
    """

    def __init__(self, x, y=None):
        self.x = torch.FloatTensor(x)
        if y is None:
            self.y = None
        else:
            self.y = torch.FloatTensor(y)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        features = self.x[idx]
        if self.y is None:
            return features
        return features, self.y[idx]

class ResidualBlock1D(nn.Module):
    """Two Conv1d + BatchNorm1d layers wrapped by a skip connection.

    When the channel counts differ the skip path is a kernel-size-1 convolution
    that projects the input to ``out_channels``; otherwise it is the identity.
    The skip addition needs the convolution path to keep the sequence length,
    which the default ``kernel_size=3`` / ``padding=1`` does.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, padding=1):
        super().__init__()
        # Main path
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size, padding=padding)
        self.bn2 = nn.BatchNorm1d(out_channels)
        # Skip path
        self.shortcut = (
            nn.Identity()
            if in_channels == out_channels
            else nn.Conv1d(in_channels, out_channels, 1)
        )

    def forward(self, x):
        skip = self.shortcut(x)
        hidden = self.relu(self.bn1(self.conv1(x)))
        hidden = self.bn2(self.conv2(hidden))
        hidden += skip
        return self.relu(hidden)

class ProteinCNN(nn.Module):
    """1D-CNN that maps one feature vector per sample to ``num_classes`` logits.

    ``forward`` inserts a channel axis at position 1 before the convolutions, so
    the input is presumably ``(batch, input_dim)`` -- confirm against the caller.
    ``input_dim`` is accepted for interface compatibility but no layer depends on
    it: the adaptive average pool collapses whatever length remains to 1.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()

        # Stem followed by three residual stages; the first two stages halve the length.
        stem = [
            nn.Conv1d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
        ]
        stages = [
            ResidualBlock1D(32, 64),
            nn.MaxPool1d(2),
            ResidualBlock1D(64, 128),
            nn.MaxPool1d(2),
            ResidualBlock1D(128, 256),
            nn.AdaptiveAvgPool1d(1),
        ]
        self.features = nn.Sequential(*stem, *stages)

        # Classification head on the pooled 256-channel descriptor; outputs raw logits.
        head = [
            nn.Flatten(),
            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        pooled = self.features(x.unsqueeze(1))
        return self.classifier(pooled)

In [None]:
# ==========================================
# 3. TRAINING
# ==========================================
print("\n[2/5] TRAINING 1D-CNN...")

# Seed PyTorch's RNG so weight initialisation and batch shuffling start from the
# same state on every run (42 matches the train/validation split seed).
torch.manual_seed(CFG.get('seed', 42))

train_loader = DataLoader(ProteinDataset(X_train, y_train), batch_size=CFG['batch_size'], shuffle=True)
val_loader = DataLoader(ProteinDataset(X_val, y_val), batch_size=CFG['batch_size'])

model = ProteinCNN(CFG['input_dim'], len(target_names)).to(CFG['device'])
if torch.cuda.device_count() > 1: model = nn.DataParallel(model)

# Multi-label objective: an independent sigmoid/BCE per term on the raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=CFG['lr'], weight_decay=1e-2)
# Halve the learning rate after the validation loss has stalled for more than 2 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

best_loss = float('inf')
patience = 0  # epochs since the last validation improvement

for epoch in range(CFG['epochs']):
    model.train()
    train_loss = 0.0
    for x, y in train_loader:
        x, y = x.to(CFG['device']), y.to(CFG['device'])
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        # Weight by batch size so a short final batch does not skew the epoch mean.
        train_loss += loss.item() * x.size(0)

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(CFG['device']), y.to(CFG['device'])
            val_loss += criterion(model(x), y).item() * x.size(0)

    avg_train = train_loss / len(train_loader.dataset)
    avg_val = val_loss / len(val_loader.dataset)
    print(f"Epoch {epoch+1:02d} | Train Loss: {avg_train:.4f} | Val Loss: {avg_val:.4f}")

    scheduler.step(avg_val)
    if avg_val < best_loss:
        best_loss = avg_val
        # Save the model; unwrap DataParallel so the checkpoint keys carry no
        # 'module.' prefix and load into a plain ProteinCNN.
        to_save = model.module if isinstance(model, nn.DataParallel) else model
        torch.save(to_save.state_dict(), 'best_cnn_model.pth')
        patience = 0
    else:
        patience += 1
        if patience >= 5:
            print("Early Stopping.")
            break

In [None]:
# ==========================================
# 4. PREDICT & PROPAGATE
# ==========================================
print("\n[3/5] PREDICTING...")
# Rebuild the architecture and restore the best checkpoint written during training.
model = ProteinCNN(CFG['input_dim'], len(target_names)).to(CFG['device'])
# map_location places the stored tensors on the configured device, so the load
# also works when the checkpoint was written from a device that is not available now.
model.load_state_dict(torch.load('best_cnn_model.pth', map_location=CFG['device']))
if torch.cuda.device_count() > 1: model = nn.DataParallel(model)
model.eval()

# shuffle=False keeps prediction rows in the same order as the test embeddings.
test_loader = DataLoader(ProteinDataset(X_test_emb), batch_size=256, shuffle=False)
preds = []
with torch.no_grad():
    for x in tqdm(test_loader):
        # Sigmoid turns the per-term logits into independent probabilities.
        preds.append(torch.sigmoid(model(x.to(CFG['device']))).cpu().numpy())
final_probs = np.vstack(preds)

print("\n[4/5] APPLYING ONTOLOGY PROPAGATION...")

def propagate_scores_to_parents(probs, term_to_idx, term_parents):
    """Raise each parent column of ``probs`` to at least its children's scores, in place.

    All child -> parent links are swept repeatedly until one full sweep changes
    nothing, so a score reaches ancestors at any depth no matter in which order
    ``term_parents`` lists the terms.

    Args:
        probs: 2-D float array (rows = samples, columns = terms); modified in place.
        term_to_idx: Mapping from term name to its column index in ``probs``.
        term_parents: Mapping from child term to a list of parent terms. Terms
            that are missing from ``term_to_idx`` are ignored.

    Returns:
        Number of sweeps performed, including the final sweep that found nothing
        left to raise.
    """
    links = [
        (term_to_idx[child], term_to_idx[parent])
        for child, parents in term_parents.items() if child in term_to_idx
        for parent in parents if parent in term_to_idx
    ]
    sweeps = 0
    changed = True
    while changed:
        changed = False
        sweeps += 1
        for c_idx, p_idx in links:
            child_col = probs[:, c_idx]
            parent_col = probs[:, p_idx]  # a view: writes below land in probs
            higher = child_col > parent_col
            # Strict comparison: values only ever increase, so the loop terminates
            # (NaNs compare False and are never copied).
            if higher.any():
                parent_col[higher] = child_col[higher]
                changed = True
    return sweeps

try:
    # Prefer the ontology file shipped with the data; fall back to the published one.
    obo_path = f"{CFG['data_path']}/Train/go-basic.obo"
    go_path = obo_path if os.path.exists(obo_path) else "http://purl.obolibrary.org/obo/go/go-basic.obo"
    go_graph = obonet.read_obo(go_path)

    term_to_idx = {t: i for i, t in enumerate(target_names)}
    # Keep only 'is_a' parents that are themselves prediction targets.
    term_parents = {}
    for term, data in go_graph.nodes(data=True):
        if 'is_a' in data:
            parents = [p for p in data['is_a'] if p in term_to_idx]
            if parents: term_parents[term] = parents

    propagate_scores_to_parents(final_probs, term_to_idx, term_parents)
    print(" -> Propagation Done.")
except Exception as err:
    # Best-effort step: the unpropagated scores are still usable, but report why
    # propagation did not run instead of failing silently.
    print(f" -> Propagation skipped: {type(err).__name__}: {err}")

In [None]:
# ==========================================
# 5. WRITE SUBMISSION 
# ==========================================
print("\n[5/5] WRITING SUBMISSION CNN...")

# Prediction rows are matched to test ids purely by position; refuse to write a
# file whose rows would be attributed to the wrong proteins.
if len(test_ids) != len(final_probs):
    raise ValueError(
        f"Have {len(final_probs)} prediction rows but {len(test_ids)} test ids"
    )

CHUNK = 20000  # proteins per write, bounds the size of each temporary DataFrame
id_array = np.array(test_ids)
term_array = np.array(target_names)

# Mode 'w' truncates any previous file, so a single handle serves the whole write.
with open('submission.tsv', 'w') as f:
    for i in tqdm(range(0, len(test_ids), CHUNK)):
        end = min(i+CHUNK, len(test_ids))
        chunk_p = final_probs[i:end]
        # Keep only (protein, term) pairs scoring above 0.01.
        rows, cols = np.where(chunk_p > 0.01)
        if len(rows) > 0:
            df = pd.DataFrame({
                'id': id_array[i:end][rows],
                'term': term_array[cols],
                'score': chunk_p[rows, cols]
            })
            df['score'] = df['score'].map('{:.3f}'.format)
            df.to_csv(f, header=False, index=False, sep='\t')

print("\nALL DONE")
