In [2]:
import shutil
import os

# Stage the USalign binary in the working directory and mark it executable.
# shutil.copy2 returns the destination path, which is handed straight to chmod.
os.chmod(
    shutil.copy2("/kaggle/input/usalign/USalign", "/kaggle/working/USalign"),
    0o755,
)

print("USalign copied to /kaggle/working/ and made executable")

USalign copied to /kaggle/working/ and made executable


In [3]:
# naive

import os
import re
import numpy as np
import pandas as pd

def parse_tmscore_output(output):
    """Return the second ``TM-score=`` value found in ``output`` as a float.

    Every number that follows ``TM-score=`` and at least one whitespace
    character is collected; the match at index 1 is converted. An
    ``IndexError`` propagates when fewer than two values are present.
    """
    scores = re.findall(r'TM-score=\s+([\d.]+)', output)
    second_score = scores[1]
    return float(second_score)

def write_target_line(
    atom_name, atom_serial, residue_name, chain_id, residue_num,
    x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='P'
) -> str:
    """Format one fixed-column PDB ``ATOM`` record terminated by a newline.

    Args:
        atom_name: Atom label, left-justified in a 5-character field.
        atom_serial: Integer atom serial, right-justified in 5 characters.
        residue_name: Residue label, left-justified in a 3-character field.
        chain_id: Accepted for interface compatibility; not written to the line.
        residue_num: Integer residue number, right-justified in 4 characters.
        x_coord, y_coord, z_coord: Coordinates, 8 characters wide, 3 decimals.
        occupancy, b_factor: 6 characters wide, 2 decimals.
        atom_type: Element symbol appended at the end of the record.

    Returns:
        The formatted record as a string.
    """
    # The residue number uses a single 4-character field instead of a blank
    # followed by a 3-character field. Output is byte-identical for numbers
    # below 1000, while 1000-9999 no longer push the x/y/z fields one column
    # to the right (which breaks readers that parse coordinates by column).
    return (
        f'ATOM  {atom_serial:>5d}  {atom_name:<5s} {residue_name:<3s}'
        f'{residue_num:>4d}    {x_coord:>8.3f}{y_coord:>8.3f}'
        f'{z_coord:>8.3f}{occupancy:>6.2f}{b_factor:>6.2f}           {atom_type}\n'
    )

def write2pdb(df: pd.DataFrame, xyz_id: int, target_path: str) -> int:
    """Write the rows of ``df`` with usable coordinates to ``target_path``.

    For coordinate set ``xyz_id`` the columns ``x_<id>``, ``y_<id>`` and
    ``z_<id>`` are read row by row. A row becomes a C1' ``ATOM`` line only
    when all three values compare greater than ``-1e17``; other rows are
    skipped. ``resid`` supplies both the atom serial and the residue number,
    ``resname`` the residue name.

    Returns:
        The number of rows written.
    """
    x_col, y_col, z_col = f'x_{xyz_id}', f'y_{xyz_id}', f'z_{xyz_id}'
    written = 0
    with open(target_path, 'w') as pdb_file:
        for _, record in df.iterrows():
            coords = (record[x_col], record[y_col], record[z_col])
            # Any coordinate failing the comparison (NaN included) skips the row.
            if not all(value > -1e17 for value in coords):
                continue
            written += 1
            residue_index = int(record['resid'])
            pdb_file.write(write_target_line(
                atom_name="C1'", atom_serial=residue_index,
                residue_name=record['resname'], chain_id='0',
                residue_num=residue_index,
                x_coord=coords[0], y_coord=coords[1], z_coord=coords[2],
                atom_type='C'
            ))
    return written

def get_base_target_id(long_id):
    """Drop the final ``_``-separated field from ``long_id``.

    The value is converted with ``str`` first. Everything before the last
    underscore is returned; without any underscore the result is ``""``.
    """
    prefix, _, _ = str(long_id).rpartition("_")
    return prefix

def score_and_report(solution: pd.DataFrame, submission: pd.DataFrame):
    """Score every target shared by ``solution`` and ``submission`` with USalign.

    Rows are grouped by the ``ID`` column with its last ``_`` field removed.
    For each shared target, predictions 1-5 (``x_k``/``y_k``/``z_k`` columns
    of ``submission``) are aligned against every reference coordinate set in
    ``solution`` (one per ``x_<n>`` column); the best TM-score over the
    references is kept for each prediction.

    The input frames are not modified: target ids are held in separate
    Series instead of being written back as a ``target_id`` column.

    Side effects:
        Overwrites ``native.pdb`` and ``predicted.pdb`` in the current
        working directory and prints one progress line per target.

    Returns:
        ``(per_target, mean_tm)``: ``per_target`` maps each target id to its
        list of five per-prediction scores; ``mean_tm`` is the mean of the
        per-target maxima, or ``0.0`` when no target is shared.

    Raises:
        Whatever ``parse_tmscore_output`` raises when USalign's output cannot
        be parsed, and ``FileNotFoundError`` if the USalign binary is missing.
    """
    import subprocess  # function-scope import keeps this definition self-contained

    solution_ids = solution['ID'].apply(get_base_target_id)
    submission_ids = submission['ID'].apply(get_base_target_id)

    native_idxs = sorted(int(c.split('_')[1])
                         for c in solution.columns if c.startswith('x_'))

    usalign = "/kaggle/working/USalign"
    per_target = {}

    # Only targets present in both frames can be scored.
    common_targets = sorted(set(solution_ids.unique()) & set(submission_ids.unique()))

    print(f"Scoring {len(common_targets)} common targets...")

    for tid in common_targets:
        grp_nat = solution[solution_ids == tid]
        grp_pred = submission[submission_ids == tid]
        best_of_five = []

        for pred_cnt in range(1, 6):
            best_for_this_pred = 0.0
            # The prediction file is independent of the reference, so it is
            # written once per prediction rather than once per reference.
            n_pred = write2pdb(grp_pred, pred_cnt, 'predicted.pdb')
            if n_pred > 0:
                for nat_cnt in native_idxs:
                    n_nat = write2pdb(grp_nat, nat_cnt, 'native.pdb')
                    if n_nat == 0:
                        continue
                    # Argument list instead of a shell string: no quoting needed
                    # for the atom selector " C1'".
                    result = subprocess.run(
                        [usalign, 'predicted.pdb', 'native.pdb', '-atom', " C1'"],
                        stdout=subprocess.PIPE, text=True,
                    )
                    best_for_this_pred = max(best_for_this_pred,
                                             parse_tmscore_output(result.stdout))
            best_of_five.append(best_for_this_pred)

        per_target[tid] = best_of_five
        print(f"{tid}: TM-scores per model = {best_of_five}, "
              f"best = {max(best_of_five):.4f}")

    # Mean over targets of the best score among the five predictions.
    all_best_scores = [max(scores) for scores in per_target.values()]
    mean_tm = np.mean(all_best_scores) if all_best_scores else 0.0

    return per_target, mean_tm

# Read both CSV inputs, score them, and report the mean TM-score.
submission = pd.read_csv("/kaggle/input/naive-set/naive.csv")
solution = pd.read_csv(
    "/kaggle/input/validation-labels-clean-csv/validation_labels_clean.csv"
)

per_target_scores, mean_tm = score_and_report(
    solution=solution, submission=submission
)
print(f"\nMean TM-score: {mean_tm:.4f}")

Scoring 94 common targets...
8K85_A: TM-scores per model = [0.19283, 0.20135, 0.17652, 0.22244, 0.09725], best = 0.2224
8KEB_A: TM-scores per model = [0.53878, 0.56837, 0.55819, 0.44725, 0.42738], best = 0.5684
8KHH_A: TM-scores per model = [0.55576, 0.55964, 0.51214, 0.45304, 0.40885], best = 0.5596
8QHU_3: TM-scores per model = [0.30432, 0.18159, 0.2365, 0.40593, 0.3978], best = 0.4059
8QHU_4: TM-scores per model = [0.29675, 0.27557, 0.33704, 0.34278, 0.3384], best = 0.3428
8QHU_5: TM-scores per model = [0.19478, 0.25382, 0.24297, 0.35873, 0.19861], best = 0.3587
8QHU_7: TM-scores per model = [0.19318, 0.20109, 0.17621, 0.42896, 0.43153], best = 0.4315
8QHU_S4: TM-scores per model = [0.61168, 0.58314, 0.60851, 0.40743, 0.39412], best = 0.6117
8R7N_A: TM-scores per model = [0.25364, 0.21215, 0.2257, 0.19094, 0.18381], best = 0.2536
8RRI_Ax: TM-scores per model = [0.39478, 0.45784, 0.45127, 0.4211, 0.41621], best = 0.4578
8RWG_C: TM-scores per model = [0.57823, 0.55041, 0.53331, 0.5503

In [10]:
# Adaptive weighted

import os
import re
import numpy as np
import pandas as pd

def parse_tmscore_output(output):
    """Return the second ``TM-score=`` value found in ``output`` as a float.

    Every number that follows ``TM-score=`` and at least one whitespace
    character is collected; the match at index 1 is converted. An
    ``IndexError`` propagates when fewer than two values are present.
    """
    scores = re.findall(r'TM-score=\s+([\d.]+)', output)
    second_score = scores[1]
    return float(second_score)

def write_target_line(
    atom_name, atom_serial, residue_name, chain_id, residue_num,
    x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='P'
) -> str:
    """Format one fixed-column PDB ``ATOM`` record terminated by a newline.

    Args:
        atom_name: Atom label, left-justified in a 5-character field.
        atom_serial: Integer atom serial, right-justified in 5 characters.
        residue_name: Residue label, left-justified in a 3-character field.
        chain_id: Accepted for interface compatibility; not written to the line.
        residue_num: Integer residue number, right-justified in 4 characters.
        x_coord, y_coord, z_coord: Coordinates, 8 characters wide, 3 decimals.
        occupancy, b_factor: 6 characters wide, 2 decimals.
        atom_type: Element symbol appended at the end of the record.

    Returns:
        The formatted record as a string.
    """
    # The residue number uses a single 4-character field instead of a blank
    # followed by a 3-character field. Output is byte-identical for numbers
    # below 1000, while 1000-9999 no longer push the x/y/z fields one column
    # to the right (which breaks readers that parse coordinates by column).
    return (
        f'ATOM  {atom_serial:>5d}  {atom_name:<5s} {residue_name:<3s}'
        f'{residue_num:>4d}    {x_coord:>8.3f}{y_coord:>8.3f}'
        f'{z_coord:>8.3f}{occupancy:>6.2f}{b_factor:>6.2f}           {atom_type}\n'
    )

def write2pdb(df: pd.DataFrame, xyz_id: int, target_path: str) -> int:
    """Write the rows of ``df`` with usable coordinates to ``target_path``.

    For coordinate set ``xyz_id`` the columns ``x_<id>``, ``y_<id>`` and
    ``z_<id>`` are read row by row. A row becomes a C1' ``ATOM`` line only
    when all three values compare greater than ``-1e17``; other rows are
    skipped. ``resid`` supplies both the atom serial and the residue number,
    ``resname`` the residue name.

    Returns:
        The number of rows written.
    """
    x_col, y_col, z_col = f'x_{xyz_id}', f'y_{xyz_id}', f'z_{xyz_id}'
    written = 0
    with open(target_path, 'w') as pdb_file:
        for _, record in df.iterrows():
            coords = (record[x_col], record[y_col], record[z_col])
            # Any coordinate failing the comparison (NaN included) skips the row.
            if not all(value > -1e17 for value in coords):
                continue
            written += 1
            residue_index = int(record['resid'])
            pdb_file.write(write_target_line(
                atom_name="C1'", atom_serial=residue_index,
                residue_name=record['resname'], chain_id='0',
                residue_num=residue_index,
                x_coord=coords[0], y_coord=coords[1], z_coord=coords[2],
                atom_type='C'
            ))
    return written

def get_base_target_id(long_id):
    """Drop the final ``_``-separated field from ``long_id``.

    The value is converted with ``str`` first. Everything before the last
    underscore is returned; without any underscore the result is ``""``.
    """
    prefix, _, _ = str(long_id).rpartition("_")
    return prefix

def score_and_report(solution: pd.DataFrame, submission: pd.DataFrame):
    """Score every target shared by ``solution`` and ``submission`` with USalign.

    Rows are grouped by the ``ID`` column with its last ``_`` field removed.
    For each shared target, predictions 1-5 (``x_k``/``y_k``/``z_k`` columns
    of ``submission``) are aligned against every reference coordinate set in
    ``solution`` (one per ``x_<n>`` column); the best TM-score over the
    references is kept for each prediction.

    The input frames are not modified: target ids are held in separate
    Series instead of being written back as a ``target_id`` column.

    Side effects:
        Overwrites ``native.pdb`` and ``predicted.pdb`` in the current
        working directory and prints one progress line per target.

    Returns:
        ``(per_target, mean_tm)``: ``per_target`` maps each target id to its
        list of five per-prediction scores; ``mean_tm`` is the mean of the
        per-target maxima, or ``0.0`` when no target is shared.

    Raises:
        Whatever ``parse_tmscore_output`` raises when USalign's output cannot
        be parsed, and ``FileNotFoundError`` if the USalign binary is missing.
    """
    import subprocess  # function-scope import keeps this definition self-contained

    solution_ids = solution['ID'].apply(get_base_target_id)
    submission_ids = submission['ID'].apply(get_base_target_id)

    native_idxs = sorted(int(c.split('_')[1])
                         for c in solution.columns if c.startswith('x_'))

    usalign = "/kaggle/working/USalign"
    per_target = {}

    # Only targets present in both frames can be scored.
    common_targets = sorted(set(solution_ids.unique()) & set(submission_ids.unique()))

    print(f"Scoring {len(common_targets)} common targets...")

    for tid in common_targets:
        grp_nat = solution[solution_ids == tid]
        grp_pred = submission[submission_ids == tid]
        best_of_five = []

        for pred_cnt in range(1, 6):
            best_for_this_pred = 0.0
            # The prediction file is independent of the reference, so it is
            # written once per prediction rather than once per reference.
            n_pred = write2pdb(grp_pred, pred_cnt, 'predicted.pdb')
            if n_pred > 0:
                for nat_cnt in native_idxs:
                    n_nat = write2pdb(grp_nat, nat_cnt, 'native.pdb')
                    if n_nat == 0:
                        continue
                    # Argument list instead of a shell string: no quoting needed
                    # for the atom selector " C1'".
                    result = subprocess.run(
                        [usalign, 'predicted.pdb', 'native.pdb', '-atom', " C1'"],
                        stdout=subprocess.PIPE, text=True,
                    )
                    best_for_this_pred = max(best_for_this_pred,
                                             parse_tmscore_output(result.stdout))
            best_of_five.append(best_for_this_pred)

        per_target[tid] = best_of_five
        print(f"{tid}: TM-scores per model = {best_of_five}, "
              f"best = {max(best_of_five):.4f}")

    # Mean over targets of the best score among the five predictions.
    all_best_scores = [max(scores) for scores in per_target.values()]
    mean_tm = np.mean(all_best_scores) if all_best_scores else 0.0

    return per_target, mean_tm

# Read both CSV inputs, score them, and report the mean TM-score.
submission = pd.read_csv("/kaggle/input/ensembles-3/adaptive_weighted.csv")
solution = pd.read_csv(
    "/kaggle/input/validation-labels-clean-csv/validation_labels_clean.csv"
)

per_target_scores, mean_tm = score_and_report(
    solution=solution, submission=submission
)
print(f"\nMean TM-score: {mean_tm:.4f}")

Scoring 94 common targets...
8K85_A: TM-scores per model = [0.22598, 0.20073, 0.2101, 0.17677, 0.19867], best = 0.2260
8KEB_A: TM-scores per model = [0.55093, 0.56856, 0.56488, 0.57248, 0.54548], best = 0.5725
8KHH_A: TM-scores per model = [0.54022, 0.55522, 0.49642, 0.54339, 0.50184], best = 0.5552
8QHU_3: TM-scores per model = [0.15271, 0.16569, 0.15955, 0.18246, 0.1672], best = 0.1825
8QHU_4: TM-scores per model = [0.22441, 0.34995, 0.17174, 0.14506, 0.13521], best = 0.3499
8QHU_5: TM-scores per model = [0.20834, 0.20043, 0.15006, 0.16691, 0.15653], best = 0.2083
8QHU_7: TM-scores per model = [0.12649, 0.15388, 0.13532, 0.15589, 0.15645], best = 0.1565
8QHU_S4: TM-scores per model = [0.59403, 0.54273, 0.62019, 0.61177, 0.61777], best = 0.6202
8R7N_A: TM-scores per model = [0.13994, 0.118, 0.12256, 0.1387, 0.16107], best = 0.1611
8RRI_Ax: TM-scores per model = [0.36546, 0.39188, 0.41472, 0.45608, 0.36369], best = 0.4561
8RWG_C: TM-scores per model = [0.54086, 0.52187, 0.4926, 0.5017,

In [14]:
# Best confidence

import os
import re
import numpy as np
import pandas as pd

def parse_tmscore_output(output):
    """Return the second ``TM-score=`` value found in ``output`` as a float.

    Every number that follows ``TM-score=`` and at least one whitespace
    character is collected; the match at index 1 is converted. An
    ``IndexError`` propagates when fewer than two values are present.
    """
    scores = re.findall(r'TM-score=\s+([\d.]+)', output)
    second_score = scores[1]
    return float(second_score)

def write_target_line(
    atom_name, atom_serial, residue_name, chain_id, residue_num,
    x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='P'
) -> str:
    """Format one fixed-column PDB ``ATOM`` record terminated by a newline.

    Args:
        atom_name: Atom label, left-justified in a 5-character field.
        atom_serial: Integer atom serial, right-justified in 5 characters.
        residue_name: Residue label, left-justified in a 3-character field.
        chain_id: Accepted for interface compatibility; not written to the line.
        residue_num: Integer residue number, right-justified in 4 characters.
        x_coord, y_coord, z_coord: Coordinates, 8 characters wide, 3 decimals.
        occupancy, b_factor: 6 characters wide, 2 decimals.
        atom_type: Element symbol appended at the end of the record.

    Returns:
        The formatted record as a string.
    """
    # The residue number uses a single 4-character field instead of a blank
    # followed by a 3-character field. Output is byte-identical for numbers
    # below 1000, while 1000-9999 no longer push the x/y/z fields one column
    # to the right (which breaks readers that parse coordinates by column).
    return (
        f'ATOM  {atom_serial:>5d}  {atom_name:<5s} {residue_name:<3s}'
        f'{residue_num:>4d}    {x_coord:>8.3f}{y_coord:>8.3f}'
        f'{z_coord:>8.3f}{occupancy:>6.2f}{b_factor:>6.2f}           {atom_type}\n'
    )

def write2pdb(df: pd.DataFrame, xyz_id: int, target_path: str) -> int:
    """Write the rows of ``df`` with usable coordinates to ``target_path``.

    For coordinate set ``xyz_id`` the columns ``x_<id>``, ``y_<id>`` and
    ``z_<id>`` are read row by row. A row becomes a C1' ``ATOM`` line only
    when all three values compare greater than ``-1e17``; other rows are
    skipped. ``resid`` supplies both the atom serial and the residue number,
    ``resname`` the residue name.

    Returns:
        The number of rows written.
    """
    x_col, y_col, z_col = f'x_{xyz_id}', f'y_{xyz_id}', f'z_{xyz_id}'
    written = 0
    with open(target_path, 'w') as pdb_file:
        for _, record in df.iterrows():
            coords = (record[x_col], record[y_col], record[z_col])
            # Any coordinate failing the comparison (NaN included) skips the row.
            if not all(value > -1e17 for value in coords):
                continue
            written += 1
            residue_index = int(record['resid'])
            pdb_file.write(write_target_line(
                atom_name="C1'", atom_serial=residue_index,
                residue_name=record['resname'], chain_id='0',
                residue_num=residue_index,
                x_coord=coords[0], y_coord=coords[1], z_coord=coords[2],
                atom_type='C'
            ))
    return written

def get_base_target_id(long_id):
    """Drop the final ``_``-separated field from ``long_id``.

    The value is converted with ``str`` first. Everything before the last
    underscore is returned; without any underscore the result is ``""``.
    """
    prefix, _, _ = str(long_id).rpartition("_")
    return prefix

def score_and_report(solution: pd.DataFrame, submission: pd.DataFrame):
    """Score every target shared by ``solution`` and ``submission`` with USalign.

    Rows are grouped by the ``ID`` column with its last ``_`` field removed.
    For each shared target, predictions 1-5 (``x_k``/``y_k``/``z_k`` columns
    of ``submission``) are aligned against every reference coordinate set in
    ``solution`` (one per ``x_<n>`` column); the best TM-score over the
    references is kept for each prediction.

    The input frames are not modified: target ids are held in separate
    Series instead of being written back as a ``target_id`` column.

    Side effects:
        Overwrites ``native.pdb`` and ``predicted.pdb`` in the current
        working directory and prints one progress line per target.

    Returns:
        ``(per_target, mean_tm)``: ``per_target`` maps each target id to its
        list of five per-prediction scores; ``mean_tm`` is the mean of the
        per-target maxima, or ``0.0`` when no target is shared.

    Raises:
        Whatever ``parse_tmscore_output`` raises when USalign's output cannot
        be parsed, and ``FileNotFoundError`` if the USalign binary is missing.
    """
    import subprocess  # function-scope import keeps this definition self-contained

    solution_ids = solution['ID'].apply(get_base_target_id)
    submission_ids = submission['ID'].apply(get_base_target_id)

    native_idxs = sorted(int(c.split('_')[1])
                         for c in solution.columns if c.startswith('x_'))

    usalign = "/kaggle/working/USalign"
    per_target = {}

    # Only targets present in both frames can be scored.
    common_targets = sorted(set(solution_ids.unique()) & set(submission_ids.unique()))

    print(f"Scoring {len(common_targets)} common targets...")

    for tid in common_targets:
        grp_nat = solution[solution_ids == tid]
        grp_pred = submission[submission_ids == tid]
        best_of_five = []

        for pred_cnt in range(1, 6):
            best_for_this_pred = 0.0
            # The prediction file is independent of the reference, so it is
            # written once per prediction rather than once per reference.
            n_pred = write2pdb(grp_pred, pred_cnt, 'predicted.pdb')
            if n_pred > 0:
                for nat_cnt in native_idxs:
                    n_nat = write2pdb(grp_nat, nat_cnt, 'native.pdb')
                    if n_nat == 0:
                        continue
                    # Argument list instead of a shell string: no quoting needed
                    # for the atom selector " C1'".
                    result = subprocess.run(
                        [usalign, 'predicted.pdb', 'native.pdb', '-atom', " C1'"],
                        stdout=subprocess.PIPE, text=True,
                    )
                    best_for_this_pred = max(best_for_this_pred,
                                             parse_tmscore_output(result.stdout))
            best_of_five.append(best_for_this_pred)

        per_target[tid] = best_of_five
        print(f"{tid}: TM-scores per model = {best_of_five}, "
              f"best = {max(best_of_five):.4f}")

    # Mean over targets of the best score among the five predictions.
    all_best_scores = [max(scores) for scores in per_target.values()]
    mean_tm = np.mean(all_best_scores) if all_best_scores else 0.0

    return per_target, mean_tm

# Read both CSV inputs, score them, and report the mean TM-score.
submission = pd.read_csv("//kaggle/input/ensembles/best_confidence-3.csv")
solution = pd.read_csv(
    "/kaggle/input/validation-labels-clean-csv/validation_labels_clean.csv"
)

per_target_scores, mean_tm = score_and_report(
    solution=solution, submission=submission
)
print(f"\nMean TM-score: {mean_tm:.4f}")

Scoring 94 common targets...
8K85_A: TM-scores per model = [0.22244, 0.09725, 0.18596, 0.09026, 0.15321], best = 0.2224
8KEB_A: TM-scores per model = [0.44725, 0.42738, 0.4476, 0.43691, 0.43828], best = 0.4476
8KHH_A: TM-scores per model = [0.45304, 0.40885, 0.42596, 0.45148, 0.4357], best = 0.4530
8QHU_3: TM-scores per model = [0.30432, 0.18159, 0.2365, 0.26916, 0.23286], best = 0.3043
8QHU_4: TM-scores per model = [0.34278, 0.3384, 0.36206, 0.34957, 0.32973], best = 0.3621
8QHU_5: TM-scores per model = [0.35873, 0.19861, 0.32727, 0.21138, 0.31396], best = 0.3587
8QHU_7: TM-scores per model = [0.42896, 0.43153, 0.43018, 0.18577, 0.43203], best = 0.4320
8QHU_S4: TM-scores per model = [0.40743, 0.39412, 0.4106, 0.41055, 0.40373], best = 0.4106
8R7N_A: TM-scores per model = [0.19094, 0.18381, 0.18992, 0.199, 0.20713], best = 0.2071
8RRI_Ax: TM-scores per model = [0.4211, 0.41621, 0.4224, 0.4334, 0.4417], best = 0.4417
8RWG_C: TM-scores per model = [0.55031, 0.54939, 0.55035, 0.55571, 0.5

In [4]:
# Confidence Threshold

import os
import re
import numpy as np
import pandas as pd

def parse_tmscore_output(output):
    """Return the second ``TM-score=`` value found in ``output`` as a float.

    Every number that follows ``TM-score=`` and at least one whitespace
    character is collected; the match at index 1 is converted. An
    ``IndexError`` propagates when fewer than two values are present.
    """
    scores = re.findall(r'TM-score=\s+([\d.]+)', output)
    second_score = scores[1]
    return float(second_score)

def write_target_line(
    atom_name, atom_serial, residue_name, chain_id, residue_num,
    x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='P'
) -> str:
    """Format one fixed-column PDB ``ATOM`` record terminated by a newline.

    Args:
        atom_name: Atom label, left-justified in a 5-character field.
        atom_serial: Integer atom serial, right-justified in 5 characters.
        residue_name: Residue label, left-justified in a 3-character field.
        chain_id: Accepted for interface compatibility; not written to the line.
        residue_num: Integer residue number, right-justified in 4 characters.
        x_coord, y_coord, z_coord: Coordinates, 8 characters wide, 3 decimals.
        occupancy, b_factor: 6 characters wide, 2 decimals.
        atom_type: Element symbol appended at the end of the record.

    Returns:
        The formatted record as a string.
    """
    # The residue number uses a single 4-character field instead of a blank
    # followed by a 3-character field. Output is byte-identical for numbers
    # below 1000, while 1000-9999 no longer push the x/y/z fields one column
    # to the right (which breaks readers that parse coordinates by column).
    return (
        f'ATOM  {atom_serial:>5d}  {atom_name:<5s} {residue_name:<3s}'
        f'{residue_num:>4d}    {x_coord:>8.3f}{y_coord:>8.3f}'
        f'{z_coord:>8.3f}{occupancy:>6.2f}{b_factor:>6.2f}           {atom_type}\n'
    )

def write2pdb(df: pd.DataFrame, xyz_id: int, target_path: str) -> int:
    """Write the rows of ``df`` with usable coordinates to ``target_path``.

    For coordinate set ``xyz_id`` the columns ``x_<id>``, ``y_<id>`` and
    ``z_<id>`` are read row by row. A row becomes a C1' ``ATOM`` line only
    when all three values compare greater than ``-1e17``; other rows are
    skipped. ``resid`` supplies both the atom serial and the residue number,
    ``resname`` the residue name.

    Returns:
        The number of rows written.
    """
    x_col, y_col, z_col = f'x_{xyz_id}', f'y_{xyz_id}', f'z_{xyz_id}'
    written = 0
    with open(target_path, 'w') as pdb_file:
        for _, record in df.iterrows():
            coords = (record[x_col], record[y_col], record[z_col])
            # Any coordinate failing the comparison (NaN included) skips the row.
            if not all(value > -1e17 for value in coords):
                continue
            written += 1
            residue_index = int(record['resid'])
            pdb_file.write(write_target_line(
                atom_name="C1'", atom_serial=residue_index,
                residue_name=record['resname'], chain_id='0',
                residue_num=residue_index,
                x_coord=coords[0], y_coord=coords[1], z_coord=coords[2],
                atom_type='C'
            ))
    return written

def get_base_target_id(long_id):
    """Drop the final ``_``-separated field from ``long_id``.

    The value is converted with ``str`` first. Everything before the last
    underscore is returned; without any underscore the result is ``""``.
    """
    prefix, _, _ = str(long_id).rpartition("_")
    return prefix

def score_and_report(solution: pd.DataFrame, submission: pd.DataFrame):
    """Score every target shared by ``solution`` and ``submission`` with USalign.

    Rows are grouped by the ``ID`` column with its last ``_`` field removed.
    For each shared target, predictions 1-5 (``x_k``/``y_k``/``z_k`` columns
    of ``submission``) are aligned against every reference coordinate set in
    ``solution`` (one per ``x_<n>`` column); the best TM-score over the
    references is kept for each prediction.

    The input frames are not modified: target ids are held in separate
    Series instead of being written back as a ``target_id`` column.

    Side effects:
        Overwrites ``native.pdb`` and ``predicted.pdb`` in the current
        working directory and prints one progress line per target.

    Returns:
        ``(per_target, mean_tm)``: ``per_target`` maps each target id to its
        list of five per-prediction scores; ``mean_tm`` is the mean of the
        per-target maxima, or ``0.0`` when no target is shared.

    Raises:
        Whatever ``parse_tmscore_output`` raises when USalign's output cannot
        be parsed, and ``FileNotFoundError`` if the USalign binary is missing.
    """
    import subprocess  # function-scope import keeps this definition self-contained

    solution_ids = solution['ID'].apply(get_base_target_id)
    submission_ids = submission['ID'].apply(get_base_target_id)

    native_idxs = sorted(int(c.split('_')[1])
                         for c in solution.columns if c.startswith('x_'))

    usalign = "/kaggle/working/USalign"
    per_target = {}

    # Only targets present in both frames can be scored.
    common_targets = sorted(set(solution_ids.unique()) & set(submission_ids.unique()))

    print(f"Scoring {len(common_targets)} common targets...")

    for tid in common_targets:
        grp_nat = solution[solution_ids == tid]
        grp_pred = submission[submission_ids == tid]
        best_of_five = []

        for pred_cnt in range(1, 6):
            best_for_this_pred = 0.0
            # The prediction file is independent of the reference, so it is
            # written once per prediction rather than once per reference.
            n_pred = write2pdb(grp_pred, pred_cnt, 'predicted.pdb')
            if n_pred > 0:
                for nat_cnt in native_idxs:
                    n_nat = write2pdb(grp_nat, nat_cnt, 'native.pdb')
                    if n_nat == 0:
                        continue
                    # Argument list instead of a shell string: no quoting needed
                    # for the atom selector " C1'".
                    result = subprocess.run(
                        [usalign, 'predicted.pdb', 'native.pdb', '-atom', " C1'"],
                        stdout=subprocess.PIPE, text=True,
                    )
                    best_for_this_pred = max(best_for_this_pred,
                                             parse_tmscore_output(result.stdout))
            best_of_five.append(best_for_this_pred)

        per_target[tid] = best_of_five
        print(f"{tid}: TM-scores per model = {best_of_five}, "
              f"best = {max(best_of_five):.4f}")

    # Mean over targets of the best score among the five predictions.
    all_best_scores = [max(scores) for scores in per_target.values()]
    mean_tm = np.mean(all_best_scores) if all_best_scores else 0.0

    return per_target, mean_tm

# Read both CSV inputs, score them, and report the mean TM-score.
submission = pd.read_csv("/kaggle/input/ensembles-2/confidence_threshold-2.csv")
solution = pd.read_csv(
    "/kaggle/input/validation-labels-clean-csv/validation_labels_clean.csv"
)

per_target_scores, mean_tm = score_and_report(
    solution=solution, submission=submission
)
print(f"\nMean TM-score: {mean_tm:.4f}")

Scoring 94 common targets...
8K85_A: TM-scores per model = [0.19283, 0.20135, 0.17652, 0.2035, 0.19651], best = 0.2035
8KEB_A: TM-scores per model = [0.53878, 0.56837, 0.55819, 0.58718, 0.52438], best = 0.5872
8KHH_A: TM-scores per model = [0.55576, 0.55964, 0.51214, 0.54621, 0.52171], best = 0.5596
8QHU_3: TM-scores per model = [0.30432, 0.18159, 0.2365, 0.26916, 0.23286], best = 0.3043
8QHU_4: TM-scores per model = [0.29675, 0.27557, 0.33704, 0.30269, 0.29188], best = 0.3370
8QHU_5: TM-scores per model = [0.19478, 0.25382, 0.24297, 0.17552, 0.18794], best = 0.2538
8QHU_7: TM-scores per model = [0.19318, 0.20109, 0.17621, 0.19974, 0.19262], best = 0.2011
8QHU_S4: TM-scores per model = [0.61168, 0.58314, 0.60851, 0.61033, 0.60317], best = 0.6117
8R7N_A: TM-scores per model = [0.25364, 0.21215, 0.2257, 0.2358, 0.24817], best = 0.2536
8RRI_Ax: TM-scores per model = [0.39478, 0.45784, 0.45127, 0.46007, 0.4393], best = 0.4601
8RWG_C: TM-scores per model = [0.57823, 0.55041, 0.53331, 0.5537

In [3]:
# Dynamic

import os
import re
import numpy as np
import pandas as pd

def parse_tmscore_output(output):
    """Return the second ``TM-score=`` value found in ``output`` as a float.

    Every number that follows ``TM-score=`` and at least one whitespace
    character is collected; the match at index 1 is converted. An
    ``IndexError`` propagates when fewer than two values are present.
    """
    scores = re.findall(r'TM-score=\s+([\d.]+)', output)
    second_score = scores[1]
    return float(second_score)

def write_target_line(
    atom_name, atom_serial, residue_name, chain_id, residue_num,
    x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='P'
) -> str:
    """Format one fixed-column PDB ``ATOM`` record terminated by a newline.

    Args:
        atom_name: Atom label, left-justified in a 5-character field.
        atom_serial: Integer atom serial, right-justified in 5 characters.
        residue_name: Residue label, left-justified in a 3-character field.
        chain_id: Accepted for interface compatibility; not written to the line.
        residue_num: Integer residue number, right-justified in 4 characters.
        x_coord, y_coord, z_coord: Coordinates, 8 characters wide, 3 decimals.
        occupancy, b_factor: 6 characters wide, 2 decimals.
        atom_type: Element symbol appended at the end of the record.

    Returns:
        The formatted record as a string.
    """
    # The residue number uses a single 4-character field instead of a blank
    # followed by a 3-character field. Output is byte-identical for numbers
    # below 1000, while 1000-9999 no longer push the x/y/z fields one column
    # to the right (which breaks readers that parse coordinates by column).
    return (
        f'ATOM  {atom_serial:>5d}  {atom_name:<5s} {residue_name:<3s}'
        f'{residue_num:>4d}    {x_coord:>8.3f}{y_coord:>8.3f}'
        f'{z_coord:>8.3f}{occupancy:>6.2f}{b_factor:>6.2f}           {atom_type}\n'
    )

def write2pdb(df: pd.DataFrame, xyz_id: int, target_path: str) -> int:
    """Write the rows of ``df`` with usable coordinates to ``target_path``.

    For coordinate set ``xyz_id`` the columns ``x_<id>``, ``y_<id>`` and
    ``z_<id>`` are read row by row. A row becomes a C1' ``ATOM`` line only
    when all three values compare greater than ``-1e17``; other rows are
    skipped. ``resid`` supplies both the atom serial and the residue number,
    ``resname`` the residue name.

    Returns:
        The number of rows written.
    """
    x_col, y_col, z_col = f'x_{xyz_id}', f'y_{xyz_id}', f'z_{xyz_id}'
    written = 0
    with open(target_path, 'w') as pdb_file:
        for _, record in df.iterrows():
            coords = (record[x_col], record[y_col], record[z_col])
            # Any coordinate failing the comparison (NaN included) skips the row.
            if not all(value > -1e17 for value in coords):
                continue
            written += 1
            residue_index = int(record['resid'])
            pdb_file.write(write_target_line(
                atom_name="C1'", atom_serial=residue_index,
                residue_name=record['resname'], chain_id='0',
                residue_num=residue_index,
                x_coord=coords[0], y_coord=coords[1], z_coord=coords[2],
                atom_type='C'
            ))
    return written

def get_base_target_id(long_id):
    """Drop the final ``_``-separated field from ``long_id``.

    The value is converted with ``str`` first. Everything before the last
    underscore is returned; without any underscore the result is ``""``.
    """
    prefix, _, _ = str(long_id).rpartition("_")
    return prefix

def score_and_report(solution: pd.DataFrame, submission: pd.DataFrame):
    """Score every target shared by ``solution`` and ``submission`` with USalign.

    Rows are grouped by the ``ID`` column with its last ``_`` field removed.
    For each shared target, predictions 1-5 (``x_k``/``y_k``/``z_k`` columns
    of ``submission``) are aligned against every reference coordinate set in
    ``solution`` (one per ``x_<n>`` column); the best TM-score over the
    references is kept for each prediction.

    The input frames are not modified: target ids are held in separate
    Series instead of being written back as a ``target_id`` column.

    Side effects:
        Overwrites ``native.pdb`` and ``predicted.pdb`` in the current
        working directory and prints one progress line per target.

    Returns:
        ``(per_target, mean_tm)``: ``per_target`` maps each target id to its
        list of five per-prediction scores; ``mean_tm`` is the mean of the
        per-target maxima, or ``0.0`` when no target is shared.

    Raises:
        Whatever ``parse_tmscore_output`` raises when USalign's output cannot
        be parsed, and ``FileNotFoundError`` if the USalign binary is missing.
    """
    import subprocess  # function-scope import keeps this definition self-contained

    solution_ids = solution['ID'].apply(get_base_target_id)
    submission_ids = submission['ID'].apply(get_base_target_id)

    native_idxs = sorted(int(c.split('_')[1])
                         for c in solution.columns if c.startswith('x_'))

    usalign = "/kaggle/working/USalign"
    per_target = {}

    # Only targets present in both frames can be scored.
    common_targets = sorted(set(solution_ids.unique()) & set(submission_ids.unique()))

    print(f"Scoring {len(common_targets)} common targets...")

    for tid in common_targets:
        grp_nat = solution[solution_ids == tid]
        grp_pred = submission[submission_ids == tid]
        best_of_five = []

        for pred_cnt in range(1, 6):
            best_for_this_pred = 0.0
            # The prediction file is independent of the reference, so it is
            # written once per prediction rather than once per reference.
            n_pred = write2pdb(grp_pred, pred_cnt, 'predicted.pdb')
            if n_pred > 0:
                for nat_cnt in native_idxs:
                    n_nat = write2pdb(grp_nat, nat_cnt, 'native.pdb')
                    if n_nat == 0:
                        continue
                    # Argument list instead of a shell string: no quoting needed
                    # for the atom selector " C1'".
                    result = subprocess.run(
                        [usalign, 'predicted.pdb', 'native.pdb', '-atom', " C1'"],
                        stdout=subprocess.PIPE, text=True,
                    )
                    best_for_this_pred = max(best_for_this_pred,
                                             parse_tmscore_output(result.stdout))
            best_of_five.append(best_for_this_pred)

        per_target[tid] = best_of_five
        print(f"{tid}: TM-scores per model = {best_of_five}, "
              f"best = {max(best_of_five):.4f}")

    # Mean over targets of the best score among the five predictions.
    all_best_scores = [max(scores) for scores in per_target.values()]
    mean_tm = np.mean(all_best_scores) if all_best_scores else 0.0

    return per_target, mean_tm

# Read both CSV inputs, score them, and report the mean TM-score.
submission = pd.read_csv("/kaggle/input/ensembles-2/dynamic-2.csv")
solution = pd.read_csv(
    "/kaggle/input/validation-labels-clean-csv/validation_labels_clean.csv"
)

per_target_scores, mean_tm = score_and_report(
    solution=solution, submission=submission
)
print(f"\nMean TM-score: {mean_tm:.4f}")

Scoring 94 common targets...
8K85_A: TM-scores per model = [0.19283, 0.20135, 0.17652, 0.2035, 0.19651], best = 0.2035
8KEB_A: TM-scores per model = [0.53878, 0.56837, 0.55819, 0.58718, 0.52438], best = 0.5872
8KHH_A: TM-scores per model = [0.55576, 0.55964, 0.51214, 0.54621, 0.52171], best = 0.5596
8QHU_3: TM-scores per model = [0.1594, 0.15583, 0.13434, 0.17686, 0.2231], best = 0.2231
8QHU_4: TM-scores per model = [0.29675, 0.27557, 0.33704, 0.30269, 0.29188], best = 0.3370
8QHU_5: TM-scores per model = [0.19478, 0.25382, 0.24297, 0.17552, 0.18794], best = 0.2538
8QHU_7: TM-scores per model = [0.11846, 0.16259, 0.12925, 0.15618, 0.19863], best = 0.1986
8QHU_S4: TM-scores per model = [0.61168, 0.58314, 0.60851, 0.61033, 0.60317], best = 0.6117
8R7N_A: TM-scores per model = [0.25364, 0.21215, 0.2257, 0.2358, 0.24817], best = 0.2536
8RRI_Ax: TM-scores per model = [0.39478, 0.45784, 0.45127, 0.46007, 0.4393], best = 0.4601
8RWG_C: TM-scores per model = [0.57823, 0.55041, 0.53331, 0.55372

In [9]:
# Weighted average

import os
import re
import numpy as np
import pandas as pd

def parse_tmscore_output(output):
    """Return the second ``TM-score=`` value found in ``output`` as a float.

    Every number that follows ``TM-score=`` and at least one whitespace
    character is collected; the match at index 1 is converted. An
    ``IndexError`` propagates when fewer than two values are present.
    """
    scores = re.findall(r'TM-score=\s+([\d.]+)', output)
    second_score = scores[1]
    return float(second_score)

def write_target_line(
    atom_name, atom_serial, residue_name, chain_id, residue_num,
    x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='P'
) -> str:
    """Format one fixed-column PDB ``ATOM`` record terminated by a newline.

    Args:
        atom_name: Atom label, left-justified in a 5-character field.
        atom_serial: Integer atom serial, right-justified in 5 characters.
        residue_name: Residue label, left-justified in a 3-character field.
        chain_id: Accepted for interface compatibility; not written to the line.
        residue_num: Integer residue number, right-justified in 4 characters.
        x_coord, y_coord, z_coord: Coordinates, 8 characters wide, 3 decimals.
        occupancy, b_factor: 6 characters wide, 2 decimals.
        atom_type: Element symbol appended at the end of the record.

    Returns:
        The formatted record as a string.
    """
    # The residue number uses a single 4-character field instead of a blank
    # followed by a 3-character field. Output is byte-identical for numbers
    # below 1000, while 1000-9999 no longer push the x/y/z fields one column
    # to the right (which breaks readers that parse coordinates by column).
    return (
        f'ATOM  {atom_serial:>5d}  {atom_name:<5s} {residue_name:<3s}'
        f'{residue_num:>4d}    {x_coord:>8.3f}{y_coord:>8.3f}'
        f'{z_coord:>8.3f}{occupancy:>6.2f}{b_factor:>6.2f}           {atom_type}\n'
    )

def write2pdb(df: pd.DataFrame, xyz_id: int, target_path: str) -> int:
    """Write the rows of ``df`` with usable coordinates to ``target_path``.

    For coordinate set ``xyz_id`` the columns ``x_<id>``, ``y_<id>`` and
    ``z_<id>`` are read row by row. A row becomes a C1' ``ATOM`` line only
    when all three values compare greater than ``-1e17``; other rows are
    skipped. ``resid`` supplies both the atom serial and the residue number,
    ``resname`` the residue name.

    Returns:
        The number of rows written.
    """
    x_col, y_col, z_col = f'x_{xyz_id}', f'y_{xyz_id}', f'z_{xyz_id}'
    written = 0
    with open(target_path, 'w') as pdb_file:
        for _, record in df.iterrows():
            coords = (record[x_col], record[y_col], record[z_col])
            # Any coordinate failing the comparison (NaN included) skips the row.
            if not all(value > -1e17 for value in coords):
                continue
            written += 1
            residue_index = int(record['resid'])
            pdb_file.write(write_target_line(
                atom_name="C1'", atom_serial=residue_index,
                residue_name=record['resname'], chain_id='0',
                residue_num=residue_index,
                x_coord=coords[0], y_coord=coords[1], z_coord=coords[2],
                atom_type='C'
            ))
    return written

def get_base_target_id(long_id):
    """Drop the final ``_``-separated field from ``long_id``.

    The value is converted with ``str`` first. Everything before the last
    underscore is returned; without any underscore the result is ``""``.
    """
    prefix, _, _ = str(long_id).rpartition("_")
    return prefix

def score_and_report(solution: pd.DataFrame, submission: pd.DataFrame):
    """Score every target shared by ``solution`` and ``submission`` with USalign.

    Rows are grouped by the ``ID`` column with its last ``_`` field removed.
    For each shared target, predictions 1-5 (``x_k``/``y_k``/``z_k`` columns
    of ``submission``) are aligned against every reference coordinate set in
    ``solution`` (one per ``x_<n>`` column); the best TM-score over the
    references is kept for each prediction.

    The input frames are not modified: target ids are held in separate
    Series instead of being written back as a ``target_id`` column.

    Side effects:
        Overwrites ``native.pdb`` and ``predicted.pdb`` in the current
        working directory and prints one progress line per target.

    Returns:
        ``(per_target, mean_tm)``: ``per_target`` maps each target id to its
        list of five per-prediction scores; ``mean_tm`` is the mean of the
        per-target maxima, or ``0.0`` when no target is shared.

    Raises:
        Whatever ``parse_tmscore_output`` raises when USalign's output cannot
        be parsed, and ``FileNotFoundError`` if the USalign binary is missing.
    """
    import subprocess  # function-scope import keeps this definition self-contained

    solution_ids = solution['ID'].apply(get_base_target_id)
    submission_ids = submission['ID'].apply(get_base_target_id)

    native_idxs = sorted(int(c.split('_')[1])
                         for c in solution.columns if c.startswith('x_'))

    usalign = "/kaggle/working/USalign"
    per_target = {}

    # Only targets present in both frames can be scored.
    common_targets = sorted(set(solution_ids.unique()) & set(submission_ids.unique()))

    print(f"Scoring {len(common_targets)} common targets...")

    for tid in common_targets:
        grp_nat = solution[solution_ids == tid]
        grp_pred = submission[submission_ids == tid]
        best_of_five = []

        for pred_cnt in range(1, 6):
            best_for_this_pred = 0.0
            # The prediction file is independent of the reference, so it is
            # written once per prediction rather than once per reference.
            n_pred = write2pdb(grp_pred, pred_cnt, 'predicted.pdb')
            if n_pred > 0:
                for nat_cnt in native_idxs:
                    n_nat = write2pdb(grp_nat, nat_cnt, 'native.pdb')
                    if n_nat == 0:
                        continue
                    # Argument list instead of a shell string: no quoting needed
                    # for the atom selector " C1'".
                    result = subprocess.run(
                        [usalign, 'predicted.pdb', 'native.pdb', '-atom', " C1'"],
                        stdout=subprocess.PIPE, text=True,
                    )
                    best_for_this_pred = max(best_for_this_pred,
                                             parse_tmscore_output(result.stdout))
            best_of_five.append(best_for_this_pred)

        per_target[tid] = best_of_five
        print(f"{tid}: TM-scores per model = {best_of_five}, "
              f"best = {max(best_of_five):.4f}")

    # Mean over targets of the best score among the five predictions.
    all_best_scores = [max(scores) for scores in per_target.values()]
    mean_tm = np.mean(all_best_scores) if all_best_scores else 0.0

    return per_target, mean_tm

# Read both CSV inputs, score them, and report the mean TM-score.
submission = pd.read_csv("/kaggle/input/ensembles/weighted_avg.csv")
solution = pd.read_csv(
    "/kaggle/input/validation-labels-clean-csv/validation_labels_clean.csv"
)

per_target_scores, mean_tm = score_and_report(
    solution=solution, submission=submission
)
print(f"\nMean TM-score: {mean_tm:.4f}")

Scoring 94 common targets...
8K85_A: TM-scores per model = [0.08131, 0.10415, 0.11447, 0.09555, 0.16591], best = 0.1659
8KEB_A: TM-scores per model = [0.24431, 0.27911, 0.1101, 0.1635, 0.08634], best = 0.2791
8KHH_A: TM-scores per model = [0.11567, 0.11921, 0.12751, 0.12287, 0.12452], best = 0.1275
8QHU_3: TM-scores per model = [0.1594, 0.15583, 0.13434, 0.17686, 0.2231], best = 0.2231
8QHU_4: TM-scores per model = [0.17423, 0.25454, 0.14261, 0.16907, 0.13075], best = 0.2545
8QHU_5: TM-scores per model = [0.21038, 0.20804, 0.13949, 0.20012, 0.18945], best = 0.2104
8QHU_7: TM-scores per model = [0.11846, 0.16259, 0.12925, 0.15618, 0.19863], best = 0.1986
8QHU_S4: TM-scores per model = [0.12487, 0.0793, 0.19357, 0.16577, 0.2463], best = 0.2463
8R7N_A: TM-scores per model = [0.10073, 0.14412, 0.12871, 0.06992, 0.13818], best = 0.1441
8RRI_Ax: TM-scores per model = [0.11949, 0.1092, 0.10198, 0.2358, 0.22576], best = 0.2358
8RWG_C: TM-scores per model = [0.20703, 0.19141, 0.10919, 0.21566, 

In [8]:
# Simple average

import os
import re
import numpy as np
import pandas as pd

def parse_tmscore_output(output):
    """Return the second ``TM-score=`` value found in ``output`` as a float.

    Every number that follows ``TM-score=`` and at least one whitespace
    character is collected; the match at index 1 is converted. An
    ``IndexError`` propagates when fewer than two values are present.
    """
    scores = re.findall(r'TM-score=\s+([\d.]+)', output)
    second_score = scores[1]
    return float(second_score)

def write_target_line(
    atom_name, atom_serial, residue_name, chain_id, residue_num,
    x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='P'
) -> str:
    """Format one fixed-column PDB ``ATOM`` record terminated by a newline.

    Args:
        atom_name: Atom label, left-justified in a 5-character field.
        atom_serial: Integer atom serial, right-justified in 5 characters.
        residue_name: Residue label, left-justified in a 3-character field.
        chain_id: Accepted for interface compatibility; not written to the line.
        residue_num: Integer residue number, right-justified in 4 characters.
        x_coord, y_coord, z_coord: Coordinates, 8 characters wide, 3 decimals.
        occupancy, b_factor: 6 characters wide, 2 decimals.
        atom_type: Element symbol appended at the end of the record.

    Returns:
        The formatted record as a string.
    """
    # The residue number uses a single 4-character field instead of a blank
    # followed by a 3-character field. Output is byte-identical for numbers
    # below 1000, while 1000-9999 no longer push the x/y/z fields one column
    # to the right (which breaks readers that parse coordinates by column).
    return (
        f'ATOM  {atom_serial:>5d}  {atom_name:<5s} {residue_name:<3s}'
        f'{residue_num:>4d}    {x_coord:>8.3f}{y_coord:>8.3f}'
        f'{z_coord:>8.3f}{occupancy:>6.2f}{b_factor:>6.2f}           {atom_type}\n'
    )

def write2pdb(df: pd.DataFrame, xyz_id: int, target_path: str) -> int:
    """Write the rows of ``df`` with usable coordinates to ``target_path``.

    For coordinate set ``xyz_id`` the columns ``x_<id>``, ``y_<id>`` and
    ``z_<id>`` are read row by row. A row becomes a C1' ``ATOM`` line only
    when all three values compare greater than ``-1e17``; other rows are
    skipped. ``resid`` supplies both the atom serial and the residue number,
    ``resname`` the residue name.

    Returns:
        The number of rows written.
    """
    x_col, y_col, z_col = f'x_{xyz_id}', f'y_{xyz_id}', f'z_{xyz_id}'
    written = 0
    with open(target_path, 'w') as pdb_file:
        for _, record in df.iterrows():
            coords = (record[x_col], record[y_col], record[z_col])
            # Any coordinate failing the comparison (NaN included) skips the row.
            if not all(value > -1e17 for value in coords):
                continue
            written += 1
            residue_index = int(record['resid'])
            pdb_file.write(write_target_line(
                atom_name="C1'", atom_serial=residue_index,
                residue_name=record['resname'], chain_id='0',
                residue_num=residue_index,
                x_coord=coords[0], y_coord=coords[1], z_coord=coords[2],
                atom_type='C'
            ))
    return written

def get_base_target_id(long_id):
    """Drop the final ``_``-separated field from ``long_id``.

    The value is converted with ``str`` first. Everything before the last
    underscore is returned; without any underscore the result is ``""``.
    """
    prefix, _, _ = str(long_id).rpartition("_")
    return prefix

def score_and_report(solution: pd.DataFrame, submission: pd.DataFrame):
    """Score every target shared by ``solution`` and ``submission`` with USalign.

    Rows are grouped by the ``ID`` column with its last ``_`` field removed.
    For each shared target, predictions 1-5 (``x_k``/``y_k``/``z_k`` columns
    of ``submission``) are aligned against every reference coordinate set in
    ``solution`` (one per ``x_<n>`` column); the best TM-score over the
    references is kept for each prediction.

    The input frames are not modified: target ids are held in separate
    Series instead of being written back as a ``target_id`` column.

    Side effects:
        Overwrites ``native.pdb`` and ``predicted.pdb`` in the current
        working directory and prints one progress line per target.

    Returns:
        ``(per_target, mean_tm)``: ``per_target`` maps each target id to its
        list of five per-prediction scores; ``mean_tm`` is the mean of the
        per-target maxima, or ``0.0`` when no target is shared.

    Raises:
        Whatever ``parse_tmscore_output`` raises when USalign's output cannot
        be parsed, and ``FileNotFoundError`` if the USalign binary is missing.
    """
    import subprocess  # function-scope import keeps this definition self-contained

    solution_ids = solution['ID'].apply(get_base_target_id)
    submission_ids = submission['ID'].apply(get_base_target_id)

    native_idxs = sorted(int(c.split('_')[1])
                         for c in solution.columns if c.startswith('x_'))

    usalign = "/kaggle/working/USalign"
    per_target = {}

    # Only targets present in both frames can be scored.
    common_targets = sorted(set(solution_ids.unique()) & set(submission_ids.unique()))

    print(f"Scoring {len(common_targets)} common targets...")

    for tid in common_targets:
        grp_nat = solution[solution_ids == tid]
        grp_pred = submission[submission_ids == tid]
        best_of_five = []

        for pred_cnt in range(1, 6):
            best_for_this_pred = 0.0
            # The prediction file is independent of the reference, so it is
            # written once per prediction rather than once per reference.
            n_pred = write2pdb(grp_pred, pred_cnt, 'predicted.pdb')
            if n_pred > 0:
                for nat_cnt in native_idxs:
                    n_nat = write2pdb(grp_nat, nat_cnt, 'native.pdb')
                    if n_nat == 0:
                        continue
                    # Argument list instead of a shell string: no quoting needed
                    # for the atom selector " C1'".
                    result = subprocess.run(
                        [usalign, 'predicted.pdb', 'native.pdb', '-atom', " C1'"],
                        stdout=subprocess.PIPE, text=True,
                    )
                    best_for_this_pred = max(best_for_this_pred,
                                             parse_tmscore_output(result.stdout))
            best_of_five.append(best_for_this_pred)

        per_target[tid] = best_of_five
        print(f"{tid}: TM-scores per model = {best_of_five}, "
              f"best = {max(best_of_five):.4f}")

    # Mean over targets of the best score among the five predictions.
    all_best_scores = [max(scores) for scores in per_target.values()]
    mean_tm = np.mean(all_best_scores) if all_best_scores else 0.0

    return per_target, mean_tm

# Read both CSV inputs, score them, and report the mean TM-score.
submission = pd.read_csv("/kaggle/input/ensembles-3/simple_avg.csv")
solution = pd.read_csv(
    "/kaggle/input/validation-labels-clean-csv/validation_labels_clean.csv"
)

per_target_scores, mean_tm = score_and_report(
    solution=solution, submission=submission
)
print(f"\nMean TM-score: {mean_tm:.4f}")

Scoring 94 common targets...
8K85_A: TM-scores per model = [0.08131, 0.10415, 0.11447, 0.09555, 0.16591], best = 0.1659
8KEB_A: TM-scores per model = [0.24431, 0.27911, 0.1101, 0.1635, 0.08634], best = 0.2791
8KHH_A: TM-scores per model = [0.11567, 0.11921, 0.12751, 0.12287, 0.12452], best = 0.1275
8QHU_3: TM-scores per model = [0.17602, 0.15431, 0.14653, 0.16953, 0.17563], best = 0.1760
8QHU_4: TM-scores per model = [0.19511, 0.32952, 0.11985, 0.14104, 0.1239], best = 0.3295
8QHU_5: TM-scores per model = [0.21603, 0.17397, 0.13995, 0.13894, 0.13783], best = 0.2160
8QHU_7: TM-scores per model = [0.13157, 0.17513, 0.15984, 0.14342, 0.19421], best = 0.1942
8QHU_S4: TM-scores per model = [0.12487, 0.0793, 0.19357, 0.16577, 0.2463], best = 0.2463
8R7N_A: TM-scores per model = [0.12633, 0.13465, 0.09649, 0.12659, 0.1498], best = 0.1498
8RRI_Ax: TM-scores per model = [0.11949, 0.1092, 0.10198, 0.2358, 0.22576], best = 0.2358
8RWG_C: TM-scores per model = [0.20703, 0.19141, 0.10919, 0.21566, 