In [1]:
import os,sys

import pandas as pd
pd.set_option('display.max_columns', 20)
pd.set_option('display.expand_frame_repr', False)

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from timeit import default_timer as timer
import re
import optuna
import matplotlib 
import matplotlib.pyplot as plt
from pathlib import Path


# helper--
class dotdict(dict):
	"""A dict whose items can also be read, assigned and deleted with attribute syntax."""

	def __getattr__(self, name):
		# Only invoked when regular attribute lookup fails, so real dict
		# methods keep working; unknown names surface as AttributeError.
		if name in self:
			return self[name]
		raise AttributeError(name)

	# Attribute deletion / assignment go straight to the item protocol of dict.
	__delattr__ = dict.__delitem__
	__setattr__ = dict.__setitem__

def time_to_str(t, mode='min'):
	"""Format a duration given in seconds as a short fixed-width string.

	mode='min' gives 'H hr MM min', mode='sec' gives 'M min SS sec'.
	The input is truncated to whole seconds with int() first.
	Any other mode raises NotImplementedError.
	"""
	if mode not in ('min', 'sec'):
		raise NotImplementedError

	whole_seconds = int(t)
	if mode == 'sec':
		minutes, seconds = divmod(whole_seconds, 60)
		return '%2d min %02d sec'%(minutes, seconds)

	# 'min' mode: work in (fractional) minutes; %d drops the fractional part.
	hours, minutes = divmod(whole_seconds/60, 60)
	return '%2d hr %02d min'%(hours, minutes)

def gpu_memory_use():
    """Return (total - free) memory of CUDA device 0 rounded to whole GiB, or 0 without CUDA."""
    if not torch.cuda.is_available():
        return 0
    free_bytes, total_bytes = torch.cuda.mem_get_info(torch.device(0))
    used_gib = (total_bytes - free_bytes) / 1024 ** 3
    return int(round(used_gib))

def set_aspect_equal(ax):
	"""Resize the x/y/z limits of a 3D axes so all three cover the same span.

	Each axis keeps its current midpoint; the span used for all of them is
	the largest of the three current spans.
	"""
	limits = (ax.get_xlim(), ax.get_ylim(), ax.get_zlim())

	# Midpoint of every axis and half of the widest extent.
	centres = [np.mean(lim) for lim in limits]
	half_span = max(lim[1] - lim[0] for lim in limits) / 2.0

	setters = (ax.set_xlim, ax.set_ylim, ax.set_zlim)
	for apply_limits, centre in zip(setters, centres):
		apply_limits(centre - half_span, centre + half_span)


# Report the installed PyTorch version and the CUDA version string exposed by
# torch.version.cuda, then confirm that this import/helper cell ran to the end.
print('torch',torch.__version__)
print('torch.cuda',torch.version.cuda)

print('IMPORT OK!!!')

torch 2.5.1+cu121
torch.cuda 12.1
IMPORT OK!!!


In [2]:
#tuning
class SequenceStructureDataset(Dataset):
    """Dataset that pairs a feature array with a coordinate array for every CSV row.

    The CSV must provide an 'ID' column. For each row the files
    '<ID>_features.npy' and '<ID>_coords.npy' are loaded from data_dir and
    returned as float tensors in a dict with keys 'features' and 'coords'.
    An optional transform is applied to that dict.
    """

    def __init__(self, csv_file, data_dir, transform=None):
        self.df = pd.read_csv(csv_file)
        self.data_dir = Path(data_dir)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sample_id = self.df.iloc[idx]['ID']
        feature_array = np.load(self.data_dir / f"{sample_id}_features.npy")
        coord_array = np.load(self.data_dir / f"{sample_id}_coords.npy")

        sample = {
            'features': torch.from_numpy(feature_array).float(),
            'coords': torch.from_numpy(coord_array).float(),
        }
        if self.transform:
            return self.transform(sample)
        return sample


# Root folder of the competition data.
DATA_KAGGLE_DIR = '/kaggle/input/stanford-rna-3d-folding'

# Build the training and validation datasets. Constructing each one reads its
# labels CSV immediately; the .npy files are only opened on item access.
# NOTE(review): in the visible notebook these two objects are referenced only
# from commented-out training/tuning code -- confirm they are still needed.
# NOTE(review): assumes the train_data / validation_data folders contain
# '<ID>_features.npy' and '<ID>_coords.npy' files -- verify against the dataset.
train_dataset = SequenceStructureDataset(
    csv_file=f"{DATA_KAGGLE_DIR}/train_labels.csv",
    data_dir=f"{DATA_KAGGLE_DIR}/train_data"
)
val_dataset = SequenceStructureDataset(
    csv_file=f"{DATA_KAGGLE_DIR}/validation_labels.csv",
    data_dir=f"{DATA_KAGGLE_DIR}/validation_data"
)

In [3]:
# cfg = dict(
#         seq_dim=6,
#         msa_dim=7,
#         N_ensemble=1,   # how many ensemble members
#         N_cycle=8,      # how many recycling cycles
#         m_dim=64,
#     s_in_dim=5,
#     z_in_dim=2,
#     s_dim= 512,
#     z_dim= 128,
#     N_elayers=18,
#     )
# def train_and_evaluate(
#     cfg: dict,
#     lr: float = 1e-4,
#     weight_decay: float = 1e-5,
#     batch_size: int = 8,
#     T_max: int = 10,
#     num_epochs: int = 20,
#     freeze_embed: bool = True,
#     scheduler_type: str = 'cosine',
#     optimizer_type: str = 'adamw',
#     warmup_ratio: float = 0.0,
#     device: torch.device = None
# ) -> float:
#     """
#     Fine-tunes DRfold2Model with the given configuration and returns average validation loss.
#     Supports choosing optimizer, scheduler, and optional warmup.
#     """
#     device = device or (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))

#     # DataLoader for this run
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
#     val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, num_workers=4)

#     # Instantiate model
#     model = DRfold2Model(cfg)
#     model.load_state_dict(torch.load(
#         '/kaggle/working/drfold/model_hub/drfold2_pretrained.pth'
#     ))
#     if freeze_embed:
#         for name, param in model.named_parameters():
#             if 'embed' in name:
#                 param.requires_grad = False
#     model.to(device)

#     # Choose optimizer
#     if optimizer_type == 'adamw':
#         optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=weight_decay)
#     else:
#         optimizer = SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=weight_decay, momentum=0.9)

#     # Choose scheduler
#     if scheduler_type == 'cosine':
#         scheduler = CosineAnnealingLR(optimizer, T_max=T_max)
#     else:
#         scheduler = StepLR(optimizer, step_size=max(1, T_max//2), gamma=0.1)

#     # Optional warmup via LambdaLR
#     if warmup_ratio > 0.0:
#         warmup_steps = int(warmup_ratio * num_epochs)
#         def lr_lambda(epoch):
#             return min((epoch + 1) / warmup_steps, 1.0)
#         scheduler = LambdaLR(optimizer, lr_lambda)

#     criterion = nn.MSELoss()

#     # Training loop
#     for epoch in range(1, num_epochs + 1):
#         model.train()
#         total_loss = 0.0
#         for batch in train_loader:
#             feats  = batch['features'].to(device)
#             target = batch['coords'].to(device)
#             optimizer.zero_grad()
#             pred   = model(feats)
#             loss   = criterion(pred, target)
#             loss.backward()
#             optimizer.step()
#         scheduler.step()

#     # Validation
#     model.eval()
#     val_loss = 0.0
#     with torch.no_grad():
#         for batch in val_loader:
#             feats  = batch['features'].to(device)
#             target = batch['coords'].to(device)
#             val_loss += criterion(model(feats), target).item()

#     avg_val_loss = val_loss / len(val_loader)
#     return avg_val_loss

# smoke_loss = train_and_evaluate(
#     cfg=cfg,
#     lr=1e-4,
#     weight_decay=1e-5,
#     batch_size=4,
#     T_max=5,
#     num_epochs=1,
#     freeze_embed=True
# )
# print("Validation loss after 1 epoch:", smoke_loss)

In [4]:
from datetime import datetime
import pytz
print('LOGGING TIME OF START:',  datetime.strftime(datetime.now(pytz.timezone('Asia/Singapore')), "%Y-%m-%d %H:%M:%S"))


try:
    import Bio
except:
    #for drfold2 --------
    #!pip install biopython
    !pip install /kaggle/input/biopython/biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install /kaggle/input/biopython/biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install biopython
print('PIP INSTALL OK !!!!')

LOGGING TIME OF START: 2025-05-13 06:31:15
Processing /kaggle/input/biopython/biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: biopython
Successfully installed biopython-1.85
Processing /kaggle/input/biopython/biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
biopython is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
PIP INSTALL OK !!!!


In [5]:
# Run mode: 'local' scores against the validation labels, 'submit' predicts
# the test sequences for a competition submission.
# MODE = 'local' #'local' # submit
MODE = 'submit'

DATA_KAGGLE_DIR = '/kaggle/input/stanford-rna-3d-folding'
if MODE == 'local':
    valid_df = pd.read_csv(f'{DATA_KAGGLE_DIR}/validation_sequences.csv')
    label_df = pd.read_csv(f'{DATA_KAGGLE_DIR}/validation_labels.csv')
    # Label IDs look like '<target_id>_<suffix>'; drop the last '_' field to get the target id.
    label_df['target_id'] = label_df['ID'].apply(lambda x: '_'.join(x.split('_')[:-1]))
elif MODE == 'submit':
    valid_df = pd.read_csv(f'{DATA_KAGGLE_DIR}/test_sequences.csv')
else:
    # Fail early instead of hitting a NameError on valid_df further down.
    raise ValueError(f"Unknown MODE {MODE!r}; expected 'local' or 'submit'")

print('len(valid_df)',len(valid_df))
print(valid_df.iloc[0])
print('')


# cfg = dotdict(
#     num_conf = 5,
#     max_length=480,
# )
NUM_CONF=5       # number of conformations predicted per target
MAX_LENGTH=480   # longer sequences are cropped to this many residues before prediction
DEVICE='cuda' #'cpu'

print('MODE:', MODE)
print('SETTING OK!!!')

len(valid_df) 12
target_id                                                      R1107
sequence           GGGGGCCACAGCAGAAGCGUUCACGUCGCAGCCCCUGUCAGCCAUU...
temporal_cutoff                                           2022-05-28
description        CPEB3 ribozyme\nHuman\nhuman CPEB3 HDV-like ri...
all_sequences      >7QR4_1|Chain A|U1 small nuclear ribonucleopro...
Name: 0, dtype: object

MODE: submit
SETTING OK!!!


In [6]:
import os
# Create the working folder and copy the DRfold2 files from the Kaggle input
# dataset into it with a shell `cp -r`.
os.makedirs('/kaggle/working/drfold', exist_ok=True)
!cp -r /kaggle/input/drfold/DRfold2/DRfold2/* /kaggle/working/drfold/

In [7]:
# import sys, os
# CFG97 = os.path.join('/kaggle/input/drfold/DRfold2/DRfold2', 'cfg_97')
# assert os.path.isdir(CFG97), f"{CFG97} not found!"
# sys.path.insert(0, CFG97)

# # now this should succeed:
# from EvoMSA2XYZ import MSA2XYZ
# from RNALM2.Model import RNA2nd
# from data         import parse_seq, Get_base, BASE_COOR
# from data         import write_frame_coor_to_pdb, parse_pdb_to_xyz

# print("imported!", MSA2XYZ, RNA2nd)

In [8]:
import os
import numpy as np

# 1) Location of the cfg_97 folder inside the DRfold2 input dataset
#    (adjust this to your actual path).
CFG97 = '/kaggle/input/drfold/DRfold2/DRfold2/cfg_97'

# 2) Load base.npy once; BASE_COOR is later passed to Get_base().
BASE_COOR = np.load(os.path.join(CFG97, 'base.npy'))

# 3) Put cfg_97 first on sys.path so that `data` resolves to the module in that folder.
import sys
sys.path.insert(0, CFG97)
from data import parse_seq, Get_base
# (BASE_COOR is not imported from data.py; it is loaded from base.npy above)

# 4) Example call: per-residue base coordinates for a short test sequence.
sequence = "ACGUACGUA"
base_coords = Get_base(sequence, BASE_COOR)
# base_coords.shape == (len(sequence), 3, 3)

In [9]:
import os, sys, shutil

# 1) Make sure there is a writable copy of the DRfold2 tree under /kaggle/working
#    (dirs_exist_ok=True lets this run even if the folder was already created).
shutil.copytree(
  '/kaggle/input/drfold/DRfold2/DRfold2',
  '/kaggle/working/drfold',
  dirs_exist_ok=True
)

# 2) Put the cfg_97 folder of that copy first on sys.path and import the model code.
# NOTE(review): if `data` was already imported earlier in the session, Python
# reuses that cached module rather than re-importing it from this path.
BASE = '/kaggle/working/drfold'
sys.path.insert(0, os.path.join(BASE, 'cfg_97'))
from EvoMSA2XYZ import MSA2XYZ
from RNALM2.Model import RNA2nd
from data import parse_seq, Get_base
from util import outpdb
# parse_pdb_to_xyz and write_frame_coor_to_pdb are defined in this notebook instead of imported.

import numpy as np

def parse_pdb_to_xyz(pdb_file, atom_name=" P  "):
    """Read the coordinates of one atom type from the ATOM records of a PDB file.

    Args:
        pdb_file: path of the PDB file to read.
        atom_name: atom name to select; surrounding blanks are ignored when
            comparing with the atom-name field (columns 13-16).

    Returns:
        (coords, resname, resid) where coords is a float32 array of shape
        (N, 3) -- (0, 3) when no record matches -- resname is a list of the
        stripped residue names and resid a list of integer residue numbers.

    Raises:
        ValueError: if a selected record has non-numeric coordinate or
            residue-number fields.
    """
    wanted = atom_name.strip()  # loop-invariant, so strip it once
    coords, resid, resname = [], [], []
    with open(pdb_file) as f:
        for line in f:
            if not line.startswith("ATOM") or line[12:16].strip() != wanted:
                continue
            # Fixed-column fields of the PDB ATOM record.
            coords.append((float(line[30:38]), float(line[38:46]), float(line[46:54])))
            resid.append(int(line[22:26]))
            resname.append(line[17:20].strip())
    # reshape keeps the result two-dimensional even when nothing was selected.
    return np.array(coords, dtype=np.float32).reshape(-1, 3), resname, resid

will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint


  RNAlm.load_state_dict(torch.load(saved_model,map_location=torch.device('cpu')),strict=False)


In [10]:

###########################################################3
# Folder with ground-truth PDB files and path of the USalign binary; both are
# referenced by the (currently commented-out) local-evaluation cells.
KAGGLE_TRUTH_PDB_DIR ='/kaggle/working/drfold/kaggle-casp15-truth'
USALIGN = '/kaggle/working/USalign' 
# Copy the USalign binary to the working folder and mark it executable for the owner.
# NOTE(review): the exit codes returned by os.system are ignored, so a failed
# copy or chmod goes unnoticed here.
os.system('cp /kaggle/input/usalign/USalign /kaggle/working/')
os.system('sudo chmod u+x /kaggle/working/USalign')

# evaluate helper
def get_truth_df(target_id, label_df):
    """Return the rows of label_df whose 'target_id' equals target_id, re-indexed from 0."""
    is_target = label_df['target_id'] == target_id
    return label_df[is_target].reset_index(drop=True)

def parse_usalign_for_tm_score(output):
    """Extract the second 'TM-score=' value from USalign text output.

    The second occurrence is used because it is the score normalised by the
    length of the reference (second) structure.

    Args:
        output: the text printed by USalign.

    Returns:
        The second TM-score as a float.

    Raises:
        ValueError: if the output contains fewer than two TM-score entries.
    """
    tm_scores = re.findall(r'TM-score=\s+([\d.]+)', output)
    # Check the count before indexing so a missing score raises the intended
    # ValueError rather than an IndexError.
    if len(tm_scores) < 2:
        raise ValueError('No TM score found')
    return float(tm_scores[1])

def parse_usalign_for_transform(output):
    """Parse the transformation matrix block out of USalign text output.

    After the header line announcing the rotation matrix, every line made of
    an integer row index followed by four numbers is collected until the
    first blank line. The row index is dropped.

    Returns:
        A numpy array with one row of four floats per matrix line; an empty
        array when no matrix block is present.
        NOTE(review): presumably column 0 is the translation and columns 1-3
        the rotation row -- confirm against the USalign output format.
    """
    header = "The rotation matrix to rotate Structure_1 to Structure_2"
    row_pattern = re.compile(r'^\d+\s+[-\d.]+\s+[-\d.]+\s+[-\d.]+\s+[-\d.]+$')

    lines = iter(output.splitlines())

    # Skip everything up to and including the header line.
    for line in lines:
        if header in line:
            break
    else:
        return np.array([])

    rows = []
    for line in lines:
        if header in line:
            continue
        if row_pattern.match(line):
            # Drop the leading row index, keep the four numeric fields.
            rows.append([float(field) for field in line.split()[1:]])
        elif not line.strip():
            break  # a blank line ends the matrix section
    return np.array(rows)



# data helper
def make_data(seq):
    """Build the three model inputs for one RNA sequence.

    Returns:
        (msa, base_x, seq_idx): msa is the encoded sequence stacked twice
        along dim 0 and one-hot encoded with 6 classes (float); base_x is the
        output of Get_base(seq, BASE_COOR) as a float tensor; seq_idx holds
        the 1-based residue positions as a long tensor.
    """
    # assumes parse_seq returns a 1-D integer numpy array -- TODO confirm
    tokens = parse_seq(seq)
    base = Get_base(seq, BASE_COOR)
    positions = np.arange(len(seq)) + 1

    # The single sequence row is duplicated before one-hot encoding.
    single_row = torch.from_numpy(tokens[None, :])
    stacked = torch.cat([single_row, single_row], 0) #???
    msa = F.one_hot(stacked.long(), 6).float()

    return msa, torch.from_numpy(base).float(), torch.from_numpy(positions).long()
    
def make_dummy_solution():
    """Create an empty result container with one entry per row of the global valid_df.

    Each entry is a dotdict with the target id, its sequence and an empty
    'coord' list that the prediction loop appends to.
    """
    return dotdict({
        record.target_id: dotdict(
            target_id=record.target_id,
            sequence=record.sequence,
            coord=[],
        )
        for _, record in valid_df.iterrows()
    })

def solution_to_submit_df(solution):
    """Concatenate the per-target coordinate tables of `solution` into one DataFrame.

    Every value of `solution` must expose .sequence, .coord and .target_id;
    targets are concatenated in the iteration order of the mapping.
    """
    per_target = [
        coord_to_df(entry.sequence, entry.coord, entry.target_id)
        for entry in solution.values()
    ]
    return pd.concat(per_target)
 

def coord_to_df(sequence, coord, target_id):
    """Build the submission rows for one target.

    Args:
        sequence: residue letters of the target, one per row.
        coord: list of arrays indexable as arr[:, 0..2], one per conformation;
            each must have len(sequence) rows.
        target_id: prefix used for the 'ID' column ('<target_id>_<resid>').

    Returns:
        DataFrame with columns ID, resname, resid followed by
        x_k, y_k, z_k for every conformation k (1-based).
    """
    resid = list(range(1, len(sequence) + 1))

    df = pd.DataFrame()
    df['ID'] = [f'{target_id}_{r}' for r in resid]
    df['resname'] = list(sequence)
    df['resid'] = resid

    for k, xyz in enumerate(coord, start=1):
        df[f'x_{k}'], df[f'y_{k}'], df[f'z_{k}'] = xyz[:, 0], xyz[:, 1], xyz[:, 2]
    return df

In [11]:
################### start here !!! #######################################################3
# Output folder for this run and the empty per-target result container.
out_dir = '/kaggle/working/model-output'
os.makedirs(out_dir, exist_ok=True)
solution = make_dummy_solution()


# Load the model (these are modified versions, not the same as in the DRfold2 GitHub repo).
rnalm = RNA2nd(dict(
    s_in_dim=5,
    z_in_dim=2,
    s_dim= 512,
    z_dim= 128,
    N_elayers=18,
))
rnalm_file = '/kaggle/working/drfold/model_hub/RCLM/epoch_67000'
print(rnalm_file)
# strict=False because the checkpoint carries keys the model does not have;
# printing the result shows which keys were missing or unexpected.
print(
    rnalm.load_state_dict(torch.load(rnalm_file, map_location='cpu', weights_only=True), strict=False)
    #Unexpected key(s) in state_dict: "ss_head.linear.weight", "ss_head.linear.bias".
)
rnalm = rnalm.to(DEVICE)
rnalm = rnalm.eval()
# Running totals updated by the prediction loop.
total_time_taken = 0
max_gpu_mem_used = 0

will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
/kaggle/working/drfold/model_hub/RCLM/epoch_67000
_IncompatibleKeys(missing_keys=[], unexpected_keys=['ss_head.linear.weight', 'ss_head.linear.bias'])


In [12]:
def write_frame_coor_to_pdb(coord, seq, savefile):
    """
    Write one C1' ATOM record per residue to a PDB file, followed by TER.

    coord: array indexable as coord[i, 1] -> (x, y, z); only channel 1 of
           each residue is written (the original note describes the three
           channels as P, sugar, N -- TODO confirm against the model).
    seq:   string with one residue letter per entry of coord
    savefile: path of the PDB file to create (overwritten if it exists)

    Raises IndexError if coord has fewer residues than seq.
    """
    with open(savefile, 'w') as f:
        for i, res in enumerate(seq):
            x, y, z = coord[i, 1]   # channel=1 -> sugar atom
            # Fixed-column PDB ATOM record: serial (7-11), atom name (13-16),
            # residue name (18-20), chain (22), residue number (23-26),
            # then x/y/z in columns 31-54. The four blanks after the residue
            # number (insertion code + 3 spaces) are required so that the
            # coordinates land in their columns.
            f.write(
                f"ATOM  {i + 1:5d}  C1' {res:>3s} A{i + 1:4d}    "
                f"{x:8.3f}{y:8.3f}{z:8.3f}  1.00  0.00           C\n"
            )
        f.write("TER\n")


In [13]:
# Constructor arguments for MSA2XYZ (passed as keyword arguments below).
cfg = dict(
    seq_dim=6,
    msa_dim=7,
    N_ensemble=10,   # how many ensemble members
    N_cycle=8,      # how many recycling cycles
    m_dim=64,
    s_dim=64,
    z_dim=64,
)
# One pass per conformation: conformation c is predicted with its own
# checkpoint (model_0, model_1, model_2, model_8, model_9).
for c in range(NUM_CONF): 
    msa2xyz = MSA2XYZ(**cfg)
    msa2xyz_file = [
        f'/kaggle/working/drfold/model_hub/cfg_97/model_{k}' for k in [0,1,2,8,9]
    ][c]
    print(msa2xyz_file)
    print(
        msa2xyz.load_state_dict(torch.load(msa2xyz_file, map_location='cpu', weights_only=True), strict=True)
    )
    # Plug the already loaded language model into this network (done after
    # load_state_dict, so the checkpoint does not overwrite it).
    msa2xyz.msaxyzone.premsa.rnalm = rnalm
    msa2xyz = msa2xyz.to(DEVICE)
    msa2xyz = msa2xyz.eval()
 
    for i,row in valid_df.iterrows():
        start_timer = timer()
        
        target_id = row.target_id
        sequence = row.sequence
        seq = row.sequence    
        
        # Sequences longer than MAX_LENGTH are cropped to a random window
        # [i0, i1) of MAX_LENGTH residues; shorter ones are used in full.
        # NOTE(review): np.random is not seeded anywhere in the visible code,
        # so the crop position differs between runs.
        L = len(sequence)
        if L>MAX_LENGTH:
            i0 = np.random.choice(L-MAX_LENGTH+1)
            i1 = i0 + MAX_LENGTH
        else:
            i0 = 0
            i1 = L
        
        seq = sequence[i0:i1]
        print(c,i,target_id, L, seq[:75]+'...')
        
        msa, base_x, seq_idx = make_data(seq)
        msa, base_x, seq_idx = msa.to(DEVICE), base_x.to(DEVICE), seq_idx.to(DEVICE)
        secondary = None #secondary structure
    
        with torch.no_grad(): 
            out = msa2xyz.pred(msa, seq_idx, secondary, base_x, np.array(list(seq)))

        # key = list(out.keys()) # plddt(L,L), coor(L,3,3), dist_p(L,L,38), dist_c, dist_n,
        # for k in key:
        #     print(k, type(out[k]), out[k].shape)
 
        
        # If the sequence was cropped, pad the prediction back to the full
        # length along axis 0; residues outside the window get all-zero coordinates.
        if L!=len(seq):
             out['coor'] = np.pad(out['coor'] ,((i0, L - i1), (0, 0), (0, 0)), 'constant', constant_values=0)


        print('out:',  out['coor'].shape)
        time_taken = timer()-start_timer
        total_time_taken += time_taken
        print('time_taken:', time_to_str(time_taken, mode='sec')) 
        
        gpu_mem_used = gpu_memory_use()
        max_gpu_mem_used = max(max_gpu_mem_used,gpu_mem_used)
        print('gpu_mem_used:', gpu_mem_used, 'GB')

        torch.cuda.empty_cache() 
        # Keep only index 1 of the second axis for the submission.
        # NOTE(review): presumably this is the sugar (C1') atom -- confirm.
        sugar_xyz = out['coor'][:, 1, :]   # shape (L,3)
        solution[target_id].coord.append(sugar_xyz)
    print('')
    
#-----end of conformation generation ----
print('MAX_LENGTH', MAX_LENGTH)
print('### total_time_taken:', time_to_str(total_time_taken, mode='min'))
print('### max_gpu_mem_used:', max_gpu_mem_used, 'GB')
print('')

# Assemble all targets/conformations into one table and write the submission file.
# NOTE(review): this is the only place results are persisted in this cell; if the
# loop above is interrupted, nothing has been written yet.
submit_df = solution_to_submit_df(solution)
submit_df.to_csv(f'submission.csv', index=False)
print(submit_df)
print('SUBMIT OK!!!!!!')
print('')


will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
/kaggle/working/drfold/model_hub/cfg_97/model_0
<All keys matched successfully>
0 0 R1107 69 GGGGGCCACAGCAGAAGCGUUCACGUCGCAGCCCCUGUCAGCCAUUGCACUCCGGCUGCGAAUUCUGCU...


  return fn(*args, **kwargs)


out: (69, 3, 3)
time_taken:  0 min 12 sec
gpu_mem_used: 2 GB
0 1 R1108 69 GGGGGCCACAGCAGAAGCGUUCACGUCGCGGCCCCUGUCAGCCAUUGCACUCCGGCUGCGAAUUCUGCU...
out: (69, 3, 3)
time_taken:  0 min 10 sec
gpu_mem_used: 2 GB
0 2 R1116 157 CGCCCGGAUAGCUCAGUCGGUAGAGCAGCGGCUAAAACAGCUCUGGGGUUGUACCCACCCCAGAGGCCCACGUGG...
out: (157, 3, 3)
time_taken:  0 min 31 sec
gpu_mem_used: 3 GB
0 3 R1117v2 30 UUGGGUUCCCUCACCCCAAUCAUAAAAAGG...
out: (30, 3, 3)
time_taken:  0 min 10 sec
gpu_mem_used: 2 GB
0 4 R1126 363 GGAAUCUCGCCCGAUGUUCGCAUCGGGAUUUGCAGGUCCAUGGAUUACACCAUGCAACGCAGACCUGUAGAUGCC...
out: (363, 3, 3)
time_taken:  4 min 10 sec
gpu_mem_used: 8 GB
0 5 R1128 238 GGAAUAUCGUCAUGGUGAUUCGUCACCAUGAGGCUAGAUCUCAUAUCUAGCGCUUUCGAGCGCUAGAGUCCUUAU...
out: (238, 3, 3)
time_taken:  1 min 20 sec
gpu_mem_used: 4 GB
0 6 R1136 374 GGAUACGUCUACGCUCAGUGACGGACUCUCUUCGGAGAGUCUGACAUCCGAACCAUACACGGAUGUGCCUCGCCG...
out: (374, 3, 3)
time_taken:  4 min 24 sec
gpu_mem_used: 8 GB
0 7 R1138 720 GCGGGCGUAUAGGUUCGUCUAUACGUCCGCGUUUUCCGAGAAGAGGUA

In [14]:
# train_dataset = SequenceStructureDataset(
#     csv_file=f"{DATA_KAGGLE_DIR}/train_labels.csv",
#     data_dir=f"{DATA_KAGGLE_DIR}/train_data"
# )
# val_dataset = SequenceStructureDataset(
#     csv_file=f"{DATA_KAGGLE_DIR}/validation_labels.csv",
#     data_dir=f"{DATA_KAGGLE_DIR}/validation_data"
# )

# train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4)
# val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=4)

# # from cfg_97.RNALM2.Model import RNA2nd as DRfold2Model  # adjust import as needed

# # model = DRfold2Model(rnacfg)
# # # Load pre-trained weights
# # msa2xyz_file = [f'/kaggle/working/drfold/model_hub/cfg_97/model_0' for k in [0,1,2,8,9]][0]
# model = rnalm
# model.load_state_dict(torch.load(msa2xyz_file, map_location='cpu', weights_only=True), strict=True)
# model.train()

# # Freeze embedding layers if desired
# for name, param in model.named_parameters():
#     if 'embed' in name:
#         param.requires_grad = False

# optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4, weight_decay=1e-5)
# scheduler = CosineAnnealingLR(optimizer, T_max=10)
# criterion = nn.MSELoss()

# num_epochs = 20
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# for epoch in range(1, num_epochs + 1):
#     # Training
#     model.train()
#     total_loss = 0
#     for batch in train_loader:
#         features = batch['features'].to(device)
#         coords_target = batch['coords'].to(device)

#         optimizer.zero_grad()
#         coords_pred = model(features)
#         loss = criterion(coords_pred, coords_target)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()

#     avg_train_loss = total_loss / len(train_loader)

#     # Validation
#     model.eval()
#     val_loss = 0
#     with torch.no_grad():
#         for batch in val_loader:
#             features = batch['features'].to(device)
#             coords_target = batch['coords'].to(device)
#             coords_pred = model(features)
#             val_loss += criterion(coords_pred, coords_target).item()
#     avg_val_loss = val_loss / len(val_loader)

#     scheduler.step()

#     print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")

#     # Save checkpoint
#     if epoch % 5 == 0:
#         torch.save(model.state_dict(), f"drfold2_finetuned_epoch{epoch}.pth")

In [15]:
# Rebuild the submission table from `solution` and write it to submission.csv
# in the current working directory.
# NOTE(review): the same five statements also close the prediction cell above,
# so this cell repeats that step -- confirm whether both are needed.
submit_df = solution_to_submit_df(solution)
submit_df.to_csv(f'submission.csv', index=False)
print(submit_df)
print('SUBMIT OK!!!!!!')
print('')

            ID resname  resid        x_1        y_1        z_1        x_2        y_2        z_2        x_3        y_3        z_3        x_4        y_4        z_4        x_5        y_5       z_5
0      R1107_1       G      1   2.905267   9.048628  -3.282411  -7.015918   8.532943   4.970207  -0.031350   8.395054  -9.734588   1.462059  -1.383294   4.998859  -2.708893   2.232720 -1.934017
1      R1107_2       G      2   6.057236  12.926416  -0.379654  -8.077373   9.264426  10.573361   2.114906  12.400799  -9.890106  -0.719784  -2.968459   0.928596  -6.980391   0.893952 -4.024602
2      R1107_3       G      3   9.028996  14.280936   4.500532 -10.581871   7.324004  15.353767   2.510819  18.012100  -8.438994  -5.177915  -3.207253  -2.163895 -12.186717   0.791450 -4.414519
3      R1107_4       G      4  11.440684  12.405815   9.389330 -12.854498   2.791572  18.680662   0.575659  22.589180  -5.405528 -10.183083  -1.089534  -3.836257 -17.234041   2.204441 -2.606580
4      R1107_5       G      5 

In [16]:
# !rm -rf /kaggle/working/drfold
# !rm -rf /kaggle/working/model-output
# !rm -rf /kaggle/working/USalign

In [17]:
# if 0: 
#     print('debug: show first perdict')
#     solution = list(solution.values())
#     s = solution[0]
#     target_id = s.target_id
    
#     fig = plt.figure(figsize=(10, 10))
#     ax = fig.add_subplot(111, projection='3d')
 
#     truth_df  = get_truth_df(target_id, label_df)
#     truth_pdb = f'{KAGGLE_TRUTH_PDB_DIR}/kaggle_truth_{target_id}_C1.pdb' 
#     # print(os.path.isfile(truth_pdb))

#     for c in range(5):
#         predict_pdb = f'{out_dir}/{target_id}-coor.{c}.pdb'
#         # print(os.path.isfile(predict_pdb))
        
#         if MODE=='local':
#             command = f'{USALIGN} {predict_pdb} {truth_pdb} -atom " C1\'" -m -'
#             output = os.popen(command).read()
#             tm_score = parse_usalign_for_tm_score(output)
#             transform = parse_usalign_for_transform(output)
#             aligned = s.coord[c]@transform[:,1:].T + transform[:,[0]].T

#             #---
#             if c==0:
#                 truth = truth_df[['x_1', 'y_1', 'z_1']].to_numpy().astype('float32')
#                 x, y, z = truth[:, 0], truth[:, 1], truth[:, 2]
#                 ax.scatter(x, y, z, c='black', s=30, alpha=1)
#                 ax.plot(x, y, z, color='black', linewidth=1, alpha=1, label=f'truth')
#         else:
#             aligned = s.coord[c]
#             tm_score ='?'

#         x, y, z = aligned[:, 0], aligned[:, 1], aligned[:, 2]
#         alpha =1 if c==0 else 0.2
#         ax.scatter(x, y, z, c='RED', s=30, alpha=alpha)
#         ax.plot(x, y, z, color='RED', linewidth=1, alpha=alpha, label=f'{c}: tm {tm_score}')
        
#     set_aspect_equal(ax)
#     plt.legend()
#     plt.show() 
#     plt.close()

In [18]:
# if MODE=='local':
#     # local validation
 
#     tm_score=[]
#     for i,row in valid_df.iterrows(): 
#         target_id = row.target_id#'R1116' #casp15 R1116: len(157)
#         seq = row.sequence 
#         #-----------------------------------------------
#         print(i,target_id, len(seq), seq[:75]+'...')
    
#         truth_pdb = f'{KAGGLE_TRUTH_PDB_DIR}/kaggle_truth_{target_id}_C1.pdb'
#         # print(os.path.isfile(truth_pdb))
        
#         tm = []
#         for c in range(NUM_CONF):
#             predict_pdb = f'{out_dir}/{target_id}-coor.{c}.pdb'
#             # print(os.path.isfile(predict_pdb))
        
#             command = f'{USALIGN} {predict_pdb} {truth_pdb} -atom " C1\'" -m -'
#             output = os.popen(command).read()
#             # print(output)
#             try:
#                 tm_c = parse_usalign_for_tm_score(output)
#             except:
#                 tm_c = 0
#             tm.append(tm_c)
#         print('### tm:', tm)
#         tm_score.append(max(tm))
    
#     print('ALL\n',tm_score)
#     print('MEAN', np.array(tm_score).mean())



In [19]:
# from torch.utils.data import Dataset, DataLoader
# from torch.optim import AdamW
# from torch.optim.lr_scheduler import CosineAnnealingLR
# from pathlib import Path

# # If the drfold package isn't installed, add its directory to PYTHONPATH
# import sys
# # Adjust the path below to where the drfold repo lives in your environment
# sys.path.append('/kaggle/working/drfold')

# # Import the DRfold2 model
# from cfg_97.RNALM2.Model import RNA2nd as DRfold2Model


# class SequenceStructureDataset(Dataset):
#     def __init__(self, csv_file, data_dir, transform=None):
#         self.df = pd.read_csv(csv_file)
#         self.data_dir = Path(data_dir)
#         self.transform = transform

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx):
#         row = self.df.iloc[idx]
#         seq_id = row['ID']
#         # Load input features (e.g., 1-hot encoding of sequence)
#         features = np.load(self.data_dir / f"{seq_id}_features.npy")
#         # Load target coordinates
#         coords = np.load(self.data_dir / f"{seq_id}_coords.npy")
#         sample = {
#             'features': torch.from_numpy(features).float(),
#             'coords': torch.from_numpy(coords).float()
#         }
#         if self.transform:
#             sample = self.transform(sample)
#         return sample

# train_dataset = SequenceStructureDataset(
#     csv_file=f"{DATA_KAGGLE_DIR}/train_labels.csv",
#     data_dir=f"{DATA_KAGGLE_DIR}/train_data"
# )
# val_dataset = SequenceStructureDataset(
#     csv_file=f"{DATA_KAGGLE_DIR}/validation_labels.csv",
#     data_dir=f"{DATA_KAGGLE_DIR}/validation_data"
# )

# train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4)
# val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=4)

# # from drfold.model import DRfold2Model  # adjust import as needed

# model = DRfold2Model(dict(
#     s_in_dim=5,
#     z_in_dim=2,
#     s_dim= 512,
#     z_dim= 128,
#     N_elayers=18,
# ))
# # Load pre-trained weights
# msa2xyz_file = [f'/kaggle/working/drfold/model_hub/cfg_97/model_{k}' for k in [0,1,2,8,9]][c]
# model.load_state_dict(torch.load(msa2xyz_file, map_location='cpu', weights_only=True), strict=True)
# model.train()

# # Freeze embedding layers if desired
# for name, param in model.named_parameters():
#     if 'embed' in name:
#         param.requires_grad = False

# optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4, weight_decay=1e-5)
# scheduler = CosineAnnealingLR(optimizer, T_max=10)
# criterion = nn.MSELoss()

# num_epochs = 20
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# for epoch in range(1, num_epochs + 1):
#     # Training
#     model.train()
#     total_loss = 0
#     for batch in train_loader:
#         features = batch['features'].to(device)
#         coords_target = batch['coords'].to(device)

#         optimizer.zero_grad()
#         coords_pred = model(features)
#         loss = criterion(coords_pred, coords_target)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()

#     avg_train_loss = total_loss / len(train_loader)

#     # Validation
#     model.eval()
#     val_loss = 0
#     with torch.no_grad():
#         for batch in val_loader:
#             features = batch['features'].to(device)
#             coords_target = batch['coords'].to(device)
#             coords_pred = model(features)
#             val_loss += criterion(coords_pred, coords_target).item()
#     avg_val_loss = val_loss / len(val_loader)

#     scheduler.step()

#     print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")

#     # Save checkpoint
#     if epoch % 5 == 0:
#         torch.save(model.state_dict(), f"drfold2_finetuned_epoch{epoch}.pth")


In [20]:
# # Install Optuna if not already installed
# !pip install optuna

# import optuna  # Using pure Optuna without PyTorchLightningPruningCallback

# def objective(trial):
#     # Suggest hyperparameters
#     lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
#     wd = trial.suggest_loguniform('weight_decay', 1e-6, 1e-3)
#     batch_size = trial.suggest_categorical('batch_size', [4, 8, 16])
#     T_max = trial.suggest_int('T_max', 5, 20)

#     # Update DataLoaders
#     train_loader = DataLoader(
#         train_dataset, batch_size=batch_size, shuffle=True, num_workers=4
#     )
#     val_loader = DataLoader(
#         val_dataset, batch_size=batch_size, shuffle=False, num_workers=4
#     )

#     # Reset model, optimizer, scheduler
#     model = DRfold2Model()
#     model.load_state_dict(torch.load(
#         "/kaggle/working/drfold/model_hub/drfold2_pretrained.pth"
#     ))
#     model.to(device)
#     optimizer = AdamW(
#         filter(lambda p: p.requires_grad, model.parameters()),
#         lr=lr, weight_decay=wd
#     )
#     scheduler = CosineAnnealingLR(optimizer, T_max=T_max)
#     criterion = nn.MSELoss()

#     # Training for a few epochs
#     for epoch in range(1, 6):  # short run for tuning
#         model.train()
#         for batch in train_loader:
#             features = batch['features'].to(device)
#             coords_target = batch['coords'].to(device)
#             optimizer.zero_grad()
#             coords_pred = model(features)
#             loss = criterion(coords_pred, coords_target)
#             loss.backward()
#             optimizer.step()
#         scheduler.step()

#     # Validation
#     model.eval()
#     val_loss = 0
#     with torch.no_grad():
#         for batch in val_loader:
#             features = batch['features'].to(device)
#             coords_target = batch['coords'].to(device)
#             coords_pred = model(features)
#             val_loss += criterion(coords_pred, coords_target).item()
#     avg_val_loss = val_loss / len(val_loader)
#     return avg_val_loss

In [21]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=20)

# print("Best trial:")
# print(study.best_trial.params)