In [1]:
import pandas as pd

# 定义文件路径
file_paths = {
    'KCAT': {
        'cv0': 'all_result_KCAT_cv0.txt',
        'cv1': 'all_result_KCAT_cv1.txt',
        'cv2': 'all_result_KCAT_cv2.txt',
        'cv3': 'all_result_KCAT_cv3.txt',
        'cv4': 'all_result_KCAT_cv4.txt'
    },
    'KM': {
        'cv0': 'all_result_KM_cv0.txt',
        'cv1': 'all_result_KM_cv1.txt',
        'cv2': 'all_result_KM_cv2.txt',
        'cv3': 'all_result_KM_cv3.txt',
        'cv4': 'all_result_KM_cv4.txt'
    },
    'KKM': {
        'cv0': 'all_result_KKM_cv0.txt',
        'cv1': 'all_result_KKM_cv1.txt',
        'cv2': 'all_result_KKM_cv2.txt',
        'cv3': 'all_result_KKM_cv3.txt',
        'cv4': 'all_result_KKM_cv4.txt'
    }
}

def process_file(file_path, suffix):
    """
    处理单个文件，提取模型名称和R²值，并去掉后缀。
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()

    data = []
    for i in range(0, len(lines), 2):  # 每次跳两行
        model_line = lines[i].strip()  # 模型名称行
        metric_line = lines[i + 1].strip()  # 性能指标行

        # 提取模型名称并去掉后缀
        model_name = model_line.replace('./', '').replace('.csv', '').replace(suffix, '')

        # 提取R²值
        r2_value = None
        r2_index = metric_line.find('r2:')  # 找到'r2:'的位置
        if r2_index != -1:  # 如果找到'r2:'
            r2_str = metric_line[r2_index + 3:].strip()  # 提取'r2:'后面的部分并去掉空格
            if r2_str:  # 检查是否为空字符串
                try:
                    r2_value = float(r2_str)  # 尝试转换为浮点数
                except ValueError:
                    print(f"Warning: Could not convert R² value '{r2_str}' to float for model {model_name}")

        if r2_value is not None:
            data.append((model_name, r2_value))

    # 创建DataFrame
    df = pd.DataFrame(data, columns=['Model', 'R2'])

    return df

# 初始化存储每个类型的 R² 值的字典
type_r2_data = {
    'KCAT': {},
    'KM': {},
    'KKM': {}
}

# 处理每个文件
for file_type, cv_files in file_paths.items():
    for cv, file_path in cv_files.items():
        # 提取后缀（如 '_KCAT'）
        suffix = f'_{file_type}'
        # 处理文件
        df = process_file(file_path, suffix)
        # 将结果存储到 type_r2_data 中
        for _, row in df.iterrows():
            model = row['Model']
            r2 = row['R2']
            if model not in type_r2_data[file_type]:
                type_r2_data[file_type][model] = []
            type_r2_data[file_type][model].append(r2)

# 计算每个类型的 R² 均值并排序
results = {}
for file_type, models_r2 in type_r2_data.items():
    # 计算每个模型的 R² 均值
    mean_r2 = {model: sum(r2_list) / len(r2_list) for model, r2_list in models_r2.items()}
    # 创建 DataFrame
    df_mean_r2 = pd.DataFrame(list(mean_r2.items()), columns=['Model', 'Mean R2'])
    # 按 R² 均值排序并计算排名
    df_mean_r2 = df_mean_r2.sort_values(by='Mean R2', ascending=False)
    df_mean_r2['Rank'] = df_mean_r2['Mean R2'].rank(ascending=False, method='min')
    # 存储结果
    results[file_type] = df_mean_r2
# print(models_r2)
# 打印结果
for file_type, df in results.items():
    print(f"Results for {file_type}:")
    print(df)
    print("\n")
type_r2_data

Results for KCAT:
                  Model   Mean R2  Rank
32        unimolv2_esmc  0.688272   1.0
7       unimolv2_prott5  0.686004   2.0
39        molebert_esmc  0.684448   3.0
10    chemberta2_prott5  0.684179   4.0
33       unimolv2_esm1b  0.683986   5.0
1       molebert_prott5  0.683535   6.0
16      chemberta2_esmc  0.683284   7.0
14          molgen_esmc  0.683021   8.0
13        unimolv2_esm2  0.682458   9.0
27     chemberta2_esm1b  0.682409  10.0
36          ecfp_prott5  0.682327  11.0
31        unimolv1_esmc  0.681507  12.0
18            ecfp_esmc  0.680119  13.0
29       molebert_esm1b  0.680102  14.0
23        molebert_esm2  0.679999  15.0
34         molgen_esm1b  0.679793  16.0
28      chemberta2_esm2  0.679408  17.0
21       unimolv1_esm1b  0.678745  18.0
4         molgen_prott5  0.678709  19.0
2       unimolv1_prott5  0.678258  20.0
38          molgen_esm2  0.677372  21.0
37        smitrans_esmc  0.676903  22.0
11           ecfp_esm1b  0.676823  23.0
42            ecfp_esm

{'KCAT': {'rdkitfp_esm1b': [0.6534576772940399,
   0.6685847997687141,
   0.6677909135418898,
   0.6504343024174565,
   0.6485445792766384],
  'molebert_prott5': [0.6879731434131401,
   0.6881043681511636,
   0.6895752140331182,
   0.6768772893984878,
   0.6751444140220295],
  'unimolv1_prott5': [0.6784466264762907,
   0.6829957800083677,
   0.68207755816972,
   0.6761871793195445,
   0.6715833287222719],
  'ecfp_prollama': [0.6314149000446794,
   0.6452956117162864,
   0.627760097514554,
   0.6239219034614929,
   0.6298357272981581],
  'molgen_prott5': [0.6845285057819588,
   0.6813479560411171,
   0.6769275742173078,
   0.680276865477803,
   0.6704623409050932],
  'maccskeys_esm1b': [0.651399315260256,
   0.6575254834159037,
   0.6531641312481382,
   0.6429563460652662,
   0.6416185090122029],
  'molgen_prollama': [0.6157287931427279,
   0.6252957458452372,
   0.6185633260705713,
   0.6223577286039528,
   0.625496582774633],
  'unimolv2_prott5': [0.6847453906759576,
   0.687681447425

In [3]:
# 提取每个类型的DataFrame并重命名列
df_kcat = results['KCAT'][['Model', 'Mean R2', 'Rank']].rename(columns={'Mean R2': 'KCAT_Mean_R2', 'Rank': 'KCAT_Rank'})
df_km = results['KM'][['Model', 'Mean R2', 'Rank']].rename(columns={'Mean R2': 'KM_Mean_R2', 'Rank': 'KM_Rank'})
df_kkm = results['KKM'][['Model', 'Mean R2', 'Rank']].rename(columns={'Mean R2': 'KKM_Mean_R2', 'Rank': 'KKM_Rank'})

# 合并三个DataFrame，确保包含所有模型
merged = df_kcat.merge(df_km, on='Model', how='outer').merge(df_kkm, on='Model', how='outer')

# 计算平均R²和平均Rank
merged['Avg_Mean_R2'] = merged[['KCAT_Mean_R2', 'KM_Mean_R2', 'KKM_Mean_R2']].mean(axis=1)
merged['Avg_Rank'] = merged[['KCAT_Rank', 'KM_Rank', 'KKM_Rank']].mean(axis=1)

# 生成综合排名（按平均Rank升序，平均R²降序）
merged = merged.sort_values(by=['Avg_Rank', 'Avg_Mean_R2'], ascending=[True, False])
merged['Overall_Rank'] = merged['Avg_Rank'].rank(method='min', ascending=True).astype(int)

# 整理列顺序
final_columns = [
    'Model', 
    'KCAT_Mean_R2', 'KM_Mean_R2', 'KKM_Mean_R2', 'Avg_Mean_R2',
    'KCAT_Rank', 'KM_Rank', 'KKM_Rank', 'Avg_Rank', 'Overall_Rank'
]
merged = merged[final_columns]

print("\nOverall Performance Across All Types:")
print(merged.to_string(index=False))

# 保存结果到CSV文件（可选）
merged.to_csv('overall_performance.csv', index=False)


Overall Performance Across All Types:
              Model  KCAT_Mean_R2  KM_Mean_R2  KKM_Mean_R2  Avg_Mean_R2  KCAT_Rank  KM_Rank  KKM_Rank  Avg_Rank  Overall_Rank
    unimolv2_prott5      0.686004    0.665471     0.595427     0.648967        2.0      1.0       6.0  3.000000             1
    molebert_prott5      0.683535    0.662031     0.599937     0.648501        6.0      5.0       1.0  4.000000             2
     unimolv2_esm1b      0.683986    0.664192     0.591957     0.646712        5.0      2.0      12.0  6.333333             3
     molebert_esm1b      0.680102    0.660553     0.595955     0.645537       14.0      8.0       3.0  8.333333             4
      molebert_esmc      0.684448    0.654684     0.596343     0.645158        3.0     21.0       2.0  8.666667             5
      unimolv2_esmc      0.688272    0.657376     0.588497     0.644715        1.0     15.0      16.0 10.666667             6
      molgen_prott5      0.678709    0.661539     0.593857     0.644701       1

In [3]:
# 提取每个类型的DataFrame并重命名列
df_kcat = results['KCAT'][['Model', 'Mean R2', 'Rank']].rename(columns={'Mean R2': 'KCAT_Mean_R2', 'Rank': 'KCAT_Rank'})
df_km = results['KM'][['Model', 'Mean R2', 'Rank']].rename(columns={'Mean R2': 'KM_Mean_R2', 'Rank': 'KM_Rank'})
df_kkm = results['KKM'][['Model', 'Mean R2', 'Rank']].rename(columns={'Mean R2': 'KKM_Mean_R2', 'Rank': 'KKM_Rank'})

# 合并三个DataFrame，确保包含所有模型
merged = df_kcat.merge(df_km, on='Model', how='outer').merge(df_kkm, on='Model', how='outer')

# 计算平均R²和平均Rank
merged['Avg_Mean_R2'] = merged[['KCAT_Mean_R2', 'KM_Mean_R2', 'KKM_Mean_R2']].mean(axis=1)
merged['Avg_Rank'] = merged[['KCAT_Rank', 'KM_Rank', 'KKM_Rank']].mean(axis=1)

# 对 R² 值保留三位小数
merged['KCAT_Mean_R2'] = merged['KCAT_Mean_R2'].round(3)
merged['KM_Mean_R2'] = merged['KM_Mean_R2'].round(3)
merged['KKM_Mean_R2'] = merged['KKM_Mean_R2'].round(3)
merged['Avg_Mean_R2'] = merged['Avg_Mean_R2'].round(3)

# 生成综合排名（按平均Rank升序，平均R²降序）
merged = merged.sort_values(by=['Avg_Rank', 'Avg_Mean_R2'], ascending=[True, False])

# merged['Overall_Rank'] = merged['Avg_Rank'].rank(method='min', ascending=True).astype(int)
################################## 可选 ###################################
merged['Overall_Rank'] = merged['Avg_Mean_R2'].rank(method='min', ascending=False).astype(int)

# 整理列顺序
final_columns = [
    'Model', 
    'KCAT_Mean_R2', 'KM_Mean_R2', 'KKM_Mean_R2', 'Avg_Mean_R2',
    'KCAT_Rank', 'KM_Rank', 'KKM_Rank', 'Avg_Rank', 'Overall_Rank'
]
merged = merged[final_columns]

print("\nOverall Performance Across All Types:")
print(merged.to_string(index=False))

# 保存结果到CSV文件（可选）
merged.to_csv('overall_performance.csv', index=False)


Overall Performance Across All Types:
              Model  KCAT_Mean_R2  KM_Mean_R2  KKM_Mean_R2  Avg_Mean_R2  KCAT_Rank  KM_Rank  KKM_Rank  Avg_Rank  Overall_Rank
    unimolv2_prott5         0.686       0.665        0.595        0.649        2.0      1.0       6.0  3.000000             1
    molebert_prott5         0.684       0.662        0.600        0.649        6.0      5.0       1.0  4.000000             1
     unimolv2_esm1b         0.684       0.664        0.592        0.647        5.0      2.0      12.0  6.333333             3
     molebert_esm1b         0.680       0.661        0.596        0.646       14.0      8.0       3.0  8.333333             4
      molebert_esmc         0.684       0.655        0.596        0.645        3.0     21.0       2.0  8.666667             5
      molgen_prott5         0.679       0.662        0.594        0.645       19.0      6.0       7.0 10.666667             5
      unimolv2_esmc         0.688       0.657        0.588        0.645        

In [2]:
import pandas as pd
import re

# 定义文件路径
file_paths = {
    'KCAT': {
        'cv0': 'all_bianyifangxiang_KCAT_cv0.txt',
        'cv1': 'all_bianyifangxiang_KCAT_cv1.txt',
        'cv2': 'all_bianyifangxiang_KCAT_cv2.txt',
        'cv3': 'all_bianyifangxiang_KCAT_cv3.txt',
        'cv4': 'all_bianyifangxiang_KCAT_cv4.txt'
    },
    'KM': {
        'cv0': 'all_bianyifangxiang_KM_cv0.txt',
        'cv1': 'all_bianyifangxiang_KM_cv1.txt',
        'cv2': 'all_bianyifangxiang_KM_cv2.txt',
        'cv3': 'all_bianyifangxiang_KM_cv3.txt',
        'cv4': 'all_bianyifangxiang_KM_cv4.txt'
    },
    'KKM': {
        'cv0': 'all_bianyifangxiang_KKM_cv0.txt',
        'cv1': 'all_bianyifangxiang_KKM_cv1.txt',
        'cv2': 'all_bianyifangxiang_KKM_cv2.txt',
        'cv3': 'all_bianyifangxiang_KKM_cv3.txt',
        'cv4': 'all_bianyifangxiang_KKM_cv4.txt'
    }
}

def process_file(file_path):
    """
    处理单个文件，提取模型名称和指标值（Accuracy, Precision, Recall, F1）。
    """
    with open(file_path, 'r') as file:
        content = file.read()

    # 正则表达式匹配模型名称和指标值
    pattern = re.compile(r'(\w+): Accuracy: ([\d.]+) Precision: ([\d.]+) Recall: ([\d.]+) F1: ([\d.]+)')
    matches = pattern.findall(content)

    # 提取数据
    data = []
    for match in matches:
        model = match[0]
        accuracy = float(match[1])
        precision = float(match[2])
        recall = float(match[3])
        f1 = float(match[4])
        data.append((model, accuracy, precision, recall, f1))

    # 创建DataFrame
    df = pd.DataFrame(data, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1'])
    return df

# 初始化存储每个类型的指标值的字典
type_metrics_data = {
    'KCAT': {},
    'KM': {},
    'KKM': {}
}

# 处理每个文件
for file_type, cv_files in file_paths.items():
    for cv, file_path in cv_files.items():
        # 处理文件
        df = process_file(file_path)
        # 将结果存储到 type_metrics_data 中
        for _, row in df.iterrows():
            model = row['Model']
            metrics = row[['Accuracy', 'Precision', 'Recall', 'F1']].tolist()
            if model not in type_metrics_data[file_type]:
                type_metrics_data[file_type][model] = []
            type_metrics_data[file_type][model].append(metrics)

# 计算每个类型的指标均值并排序
results = {}
for file_type, models_metrics in type_metrics_data.items():
    # 计算每个模型的指标均值
    mean_metrics = {}
    for model, metrics_list in models_metrics.items():
        # 计算每个指标的均值
        mean_accuracy = sum([m[0] for m in metrics_list]) / len(metrics_list)
        mean_precision = sum([m[1] for m in metrics_list]) / len(metrics_list)
        mean_recall = sum([m[2] for m in metrics_list]) / len(metrics_list)
        mean_f1 = sum([m[3] for m in metrics_list]) / len(metrics_list)
        mean_metrics[model] = [mean_accuracy, mean_precision, mean_recall, mean_f1]

    # 创建 DataFrame
    df_mean_metrics = pd.DataFrame(list(mean_metrics.items()), columns=['Model', 'Mean Metrics'])
    # 展开 Mean Metrics 列
    df_mean_metrics[['Mean Accuracy', 'Mean Precision', 'Mean Recall', 'Mean F1']] = pd.DataFrame(df_mean_metrics['Mean Metrics'].tolist(), index=df_mean_metrics.index)
    df_mean_metrics = df_mean_metrics.drop(columns=['Mean Metrics'])

    # 按 Mean Accuracy 排序并计算排名
    df_mean_metrics = df_mean_metrics.sort_values(by='Mean Accuracy', ascending=False)
    df_mean_metrics['Rank'] = df_mean_metrics['Mean Accuracy'].rank(ascending=False, method='min')

    # 存储结果
    results[file_type] = df_mean_metrics

# 打印结果
for file_type, df in results.items():
    print(f"Results for {file_type}:")
    print(df)
    print("\n")

# 保存结果到 CSV 文件（可选）
# for file_type, df in results.items():
#     df.to_csv(f'mean_metrics_rank_{file_type}.csv', index=False)
df

Results for KCAT:
                       Model  Mean Accuracy  Mean Precision  Mean Recall  \
13      molebert_prott5_KCAT       0.883779        0.754972     0.670750   
33          ecfp_prott5_KCAT       0.883191        0.747540     0.679613   
11        molebert_esm2_KCAT       0.882700        0.752396     0.667477   
31            ecfp_esm2_KCAT       0.880938        0.745874     0.666140   
38       rdkitfp_prott5_KCAT       0.880913        0.739608     0.677121   
10       molebert_esm1b_KCAT       0.880907        0.752252     0.654728   
12        molebert_esmc_KCAT       0.880729        0.746701     0.663293   
36         rdkitfp_esm2_KCAT       0.880034        0.743916     0.663226   
35        rdkitfp_esm1b_KCAT       0.879620        0.741898     0.663553   
30           ecfp_esm1b_KCAT       0.878945        0.740912     0.661770   
32            ecfp_esmc_KCAT       0.878855        0.740492     0.660987   
43     maccskeys_prott5_KCAT       0.877667        0.731330     0.6702

Unnamed: 0,Model,Mean Accuracy,Mean Precision,Mean Recall,Mean F1,Rank
18,unimolv1_prott5_KKM,0.859771,0.739102,0.700065,0.718981,1.0
8,molgen_prott5_KKM,0.857469,0.728624,0.705804,0.716891,2.0
25,chemberta2_esm1b_KKM,0.856827,0.727335,0.709128,0.71796,3.0
26,chemberta2_esm2_KKM,0.856705,0.726506,0.707033,0.716437,4.0
15,unimolv1_esm1b_KKM,0.856227,0.725257,0.707241,0.715959,5.0
3,smitrans_prott5_KKM,0.854665,0.723514,0.701292,0.712152,6.0
2,smitrans_esmc_KKM,0.854631,0.729042,0.689474,0.70828,7.0
5,molgen_esm1b_KKM,0.854497,0.726771,0.692703,0.709212,8.0
28,chemberta2_prott5_KKM,0.8544,0.723468,0.69909,0.710759,9.0
23,unimolv2_prott5_KKM,0.854121,0.724015,0.697277,0.710153,10.0


In [2]:
import pandas as pd
import re
import ast

# 定义文件路径
file_paths = {
    'KCAT': {
        'cv0': 'all_bianyiweidian_KCAT_cv0.txt',
        'cv1': 'all_bianyiweidian_KCAT_cv1.txt',
        'cv2': 'all_bianyiweidian_KCAT_cv2.txt',
        'cv3': 'all_bianyiweidian_KCAT_cv3.txt',
        'cv4': 'all_bianyiweidian_KCAT_cv4.txt'
    },
    'KM': {
        'cv0': 'all_bianyiweidian_KM_cv0.txt',
        'cv1': 'all_bianyiweidian_KM_cv1.txt',
        'cv2': 'all_bianyiweidian_KM_cv2.txt',
        'cv3': 'all_bianyiweidian_KM_cv3.txt',
        'cv4': 'all_bianyiweidian_KM_cv4.txt'
    },
    'KKM': {
        'cv0': 'all_bianyiweidian_KKM_cv0.txt',
        'cv1': 'all_bianyiweidian_KKM_cv1.txt',
        'cv2': 'all_bianyiweidian_KKM_cv2.txt',
        'cv3': 'all_bianyiweidian_KKM_cv3.txt',
        'cv4': 'all_bianyiweidian_KKM_cv4.txt'
    }
}

def process_file(file_path):
    """
    处理单个文件，提取模型名称和字典值。
    """
    with open(file_path, 'r') as file:
        content = file.read()

    # 正则表达式匹配模型名称和字典值
    pattern = re.compile(r'(\w+):\s*({.*?})')
    matches = pattern.findall(content)

    # 提取数据
    data = []
    for match in matches:
        model = match[0]
        try:
            dict_values = ast.literal_eval(match[1])  # 将字符串转换为字典
            data.append((model, dict_values))
        except (ValueError, SyntaxError):
            print(f"Warning: Could not parse dictionary for model {model} in file {file_path}")

    # 创建DataFrame
    df = pd.DataFrame(data, columns=['Model', 'Values'])
    return df

# 初始化存储每个类型的指标值的字典
type_metrics_data = {
    'KCAT': {},
    'KM': {},
    'KKM': {}
}

# 处理每个文件
for file_type, cv_files in file_paths.items():
    for cv, file_path in cv_files.items():
        # 处理文件
        df = process_file(file_path)
        # 将结果存储到 type_metrics_data 中
        for _, row in df.iterrows():
            model = row['Model']
            values = row['Values']
            if model not in type_metrics_data[file_type]:
                type_metrics_data[file_type][model] = []
            type_metrics_data[file_type][model].append(values)

# 计算每个类型的指标均值并排序
results = {}
for file_type, models_metrics in type_metrics_data.items():
    # 初始化存储每个键的均值
    mean_metrics = {key: {} for key in range(1, 7)}  # 1-6 的键

    # 计算每个模型的每个键的均值
    for model, metrics_list in models_metrics.items():
        for key in range(1, 7):
            # 提取当前键的所有值
            key_values = [metrics.get(key, None) for metrics in metrics_list]
            key_values = [v for v in key_values if v is not None]  # 过滤掉 None 值
            if key_values:  # 确保列表不为空
                mean_value = sum(key_values) / len(key_values)
                mean_metrics[key][model] = mean_value
            else:
                print(f"Warning: No valid values found for key {key} of model {model} in {file_type}")
                mean_metrics[key][model] = None  # 如果没有有效值，设置为 None

    # 对每个键的结果进行排序并计算排名
    ranked_results = {}
    for key, model_values in mean_metrics.items():
        # 创建 DataFrame
        df_mean_metrics = pd.DataFrame(list(model_values.items()), columns=['Model', f'Mean Value (Key {key})'])
        # 按 Mean Value 排序并计算排名
        df_mean_metrics = df_mean_metrics.sort_values(by=f'Mean Value (Key {key})', ascending=False)
        df_mean_metrics[f'Rank (Key {key})'] = df_mean_metrics[f'Mean Value (Key {key})'].rank(ascending=False, method='min')
        ranked_results[key] = df_mean_metrics

    # 存储结果
    results[file_type] = ranked_results

# 打印结果
for file_type, ranked_results in results.items():
    print(f"Results for {file_type}:")
    for key, df in ranked_results.items():
        print(f"\nKey {key}:")
        print(df)
    print("\n")

# 保存结果到 CSV 文件（可选）
for file_type, ranked_results in results.items():
    for key, df in ranked_results.items():
        df.to_csv(f'mean_values_rank_{file_type}_key{key}.csv', index=False)

Results for KCAT:

Key 1:
                       Model  Mean Value (Key 1)  Rank (Key 1)
22        unimolv2_esmc_KCAT            0.766154           1.0
27      chemberta2_esmc_KCAT            0.760972           2.0
7           molgen_esmc_KCAT            0.758348           3.0
23      unimolv2_prott5_KCAT            0.755732           4.0
28    chemberta2_prott5_KCAT            0.753947           5.0
2         smitrans_esmc_KCAT            0.751760           6.0
21        unimolv2_esm2_KCAT            0.749836           7.0
20       unimolv2_esm1b_KCAT            0.746023           8.0
26      chemberta2_esm2_KCAT            0.745054           9.0
17        unimolv1_esmc_KCAT            0.744836          10.0
8         molgen_prott5_KCAT            0.743927          11.0
25     chemberta2_esm1b_KCAT            0.741990          12.0
6           molgen_esm2_KCAT            0.741362          13.0
32            ecfp_esmc_KCAT            0.741034          14.0
3       smitrans_prott5_KCAT 

wild:

In [1]:
import pandas as pd

# 定义文件路径
file_paths = {
    'KCAT': {
        'cv0': 'all_result_KCAT_cv0_wild.txt',
        'cv1': 'all_result_KCAT_cv1_wild.txt',
        'cv2': 'all_result_KCAT_cv2_wild.txt',
        'cv3': 'all_result_KCAT_cv3_wild.txt',
        'cv4': 'all_result_KCAT_cv4_wild.txt'
    },
    'KM': {
        'cv0': 'all_result_KM_cv0_wild.txt',
        'cv1': 'all_result_KM_cv1_wild.txt',
        'cv2': 'all_result_KM_cv2_wild.txt',
        'cv3': 'all_result_KM_cv3_wild.txt',
        'cv4': 'all_result_KM_cv4_wild.txt'
    },
    'KKM': {
        'cv0': 'all_result_KKM_cv0_wild.txt',
        'cv1': 'all_result_KKM_cv1_wild.txt',
        'cv2': 'all_result_KKM_cv2_wild.txt',
        'cv3': 'all_result_KKM_cv3_wild.txt',
        'cv4': 'all_result_KKM_cv4_wild.txt'
    }
}

def process_file(file_path, suffix):
    """
    处理单个文件，提取模型名称和R²值，并去掉后缀。
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()

    data = []
    for i in range(0, len(lines), 2):  # 每次跳两行
        model_line = lines[i].strip()  # 模型名称行
        metric_line = lines[i + 1].strip()  # 性能指标行

        # 提取模型名称并去掉后缀
        model_name = model_line.replace('./', '').replace('.csv', '').replace(suffix, '')

        # 提取R²值
        r2_value = None
        r2_index = metric_line.find('r2:')  # 找到'r2:'的位置
        if r2_index != -1:  # 如果找到'r2:'
            r2_str = metric_line[r2_index + 3:].strip()  # 提取'r2:'后面的部分并去掉空格
            if r2_str:  # 检查是否为空字符串
                try:
                    r2_value = float(r2_str)  # 尝试转换为浮点数
                except ValueError:
                    print(f"Warning: Could not convert R² value '{r2_str}' to float for model {model_name}")

        if r2_value is not None:
            data.append((model_name, r2_value))

    # 创建DataFrame
    df = pd.DataFrame(data, columns=['Model', 'R2'])

    return df

# 初始化存储每个类型的 R² 值的字典
type_r2_data = {
    'KCAT': {},
    'KM': {},
    'KKM': {}
}

# 处理每个文件
for file_type, cv_files in file_paths.items():
    for cv, file_path in cv_files.items():
        # 提取后缀（如 '_KCAT'）
        suffix = f'_{file_type}'
        # 处理文件
        df = process_file(file_path, suffix)
        # 将结果存储到 type_r2_data 中
        for _, row in df.iterrows():
            model = row['Model']
            r2 = row['R2']
            if model not in type_r2_data[file_type]:
                type_r2_data[file_type][model] = []
            type_r2_data[file_type][model].append(r2)

# 计算每个类型的 R² 均值并排序
results = {}
for file_type, models_r2 in type_r2_data.items():
    # 计算每个模型的 R² 均值
    mean_r2 = {model: sum(r2_list) / len(r2_list) for model, r2_list in models_r2.items()}
    # 创建 DataFrame
    df_mean_r2 = pd.DataFrame(list(mean_r2.items()), columns=['Model', 'Mean R2'])
    # 按 R² 均值排序并计算排名
    df_mean_r2 = df_mean_r2.sort_values(by='Mean R2', ascending=False)
    df_mean_r2['Rank'] = df_mean_r2['Mean R2'].rank(ascending=False, method='min')
    # 存储结果
    results[file_type] = df_mean_r2

# 打印结果
for file_type, df in results.items():
    print(f"Results for {file_type}:")
    print(df)
    print("\n")


Results for KCAT:
                  Model   Mean R2  Rank
1       molebert_prott5  0.621410   1.0
29       molebert_esm1b  0.621166   2.0
23        molebert_esm2  0.617258   3.0
39        molebert_esmc  0.616782   4.0
36          ecfp_prott5  0.613393   5.0
21       unimolv1_esm1b  0.613019   6.0
33       unimolv2_esm1b  0.611651   7.0
27     chemberta2_esm1b  0.611450   8.0
34         molgen_esm1b  0.611333   9.0
11           ecfp_esm1b  0.609621  10.0
7       unimolv2_prott5  0.609571  11.0
10    chemberta2_prott5  0.608450  12.0
2       unimolv1_prott5  0.607479  13.0
42            ecfp_esm2  0.606886  14.0
25        unimolv1_esm2  0.606755  15.0
4         molgen_prott5  0.606175  16.0
32        unimolv2_esmc  0.605992  17.0
31        unimolv1_esmc  0.605640  18.0
13        unimolv2_esm2  0.605629  19.0
18            ecfp_esmc  0.605032  20.0
14          molgen_esmc  0.603746  21.0
38          molgen_esm2  0.603691  22.0
28      chemberta2_esm2  0.603638  23.0
41       smitrans_esm1

mutant：

In [1]:
import pandas as pd

# 定义文件路径
file_paths = {
    'KCAT': {
        'cv0': 'all_result_KCAT_cv0_mutant.txt',
        'cv1': 'all_result_KCAT_cv1_mutant.txt',
        'cv2': 'all_result_KCAT_cv2_mutant.txt',
        'cv3': 'all_result_KCAT_cv3_mutant.txt',
        'cv4': 'all_result_KCAT_cv4_mutant.txt'
    },
    'KM': {
        'cv0': 'all_result_KM_cv0_mutant.txt',
        'cv1': 'all_result_KM_cv1_mutant.txt',
        'cv2': 'all_result_KM_cv2_mutant.txt',
        'cv3': 'all_result_KM_cv3_mutant.txt',
        'cv4': 'all_result_KM_cv4_mutant.txt'
    },
    'KKM': {
        'cv0': 'all_result_KKM_cv0_mutant.txt',
        'cv1': 'all_result_KKM_cv1_mutant.txt',
        'cv2': 'all_result_KKM_cv2_mutant.txt',
        'cv3': 'all_result_KKM_cv3_mutant.txt',
        'cv4': 'all_result_KKM_cv4_mutant.txt'
    }
}

def process_file(file_path, suffix):
    """
    处理单个文件，提取模型名称和R²值，并去掉后缀。
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()

    data = []
    for i in range(0, len(lines), 2):  # 每次跳两行
        model_line = lines[i].strip()  # 模型名称行
        metric_line = lines[i + 1].strip()  # 性能指标行

        # 提取模型名称并去掉后缀
        model_name = model_line.replace('./', '').replace('.csv', '').replace(suffix, '')

        # 提取R²值
        r2_value = None
        r2_index = metric_line.find('r2:')  # 找到'r2:'的位置
        if r2_index != -1:  # 如果找到'r2:'
            r2_str = metric_line[r2_index + 3:].strip()  # 提取'r2:'后面的部分并去掉空格
            if r2_str:  # 检查是否为空字符串
                try:
                    r2_value = float(r2_str)  # 尝试转换为浮点数
                except ValueError:
                    print(f"Warning: Could not convert R² value '{r2_str}' to float for model {model_name}")

        if r2_value is not None:
            data.append((model_name, r2_value))

    # 创建DataFrame
    df = pd.DataFrame(data, columns=['Model', 'R2'])

    return df

# 初始化存储每个类型的 R² 值的字典
type_r2_data = {
    'KCAT': {},
    'KM': {},
    'KKM': {}
}

# 处理每个文件
for file_type, cv_files in file_paths.items():
    for cv, file_path in cv_files.items():
        # 提取后缀（如 '_KCAT'）
        suffix = f'_{file_type}'
        # 处理文件
        df = process_file(file_path, suffix)
        # 将结果存储到 type_r2_data 中
        for _, row in df.iterrows():
            model = row['Model']
            r2 = row['R2']
            if model not in type_r2_data[file_type]:
                type_r2_data[file_type][model] = []
            type_r2_data[file_type][model].append(r2)

# 计算每个类型的 R² 均值并排序
results = {}
for file_type, models_r2 in type_r2_data.items():
    # 计算每个模型的 R² 均值
    mean_r2 = {model: sum(r2_list) / len(r2_list) for model, r2_list in models_r2.items()}
    # 创建 DataFrame
    df_mean_r2 = pd.DataFrame(list(mean_r2.items()), columns=['Model', 'Mean R2'])
    # 按 R² 均值排序并计算排名
    df_mean_r2 = df_mean_r2.sort_values(by='Mean R2', ascending=False)
    df_mean_r2['Rank'] = df_mean_r2['Mean R2'].rank(ascending=False, method='min')
    # 存储结果
    results[file_type] = df_mean_r2

# 打印结果
for file_type, df in results.items():
    print(f"Results for {file_type}:")
    print(df)
    print("\n")


Results for KCAT:
                  Model   Mean R2  Rank
32        unimolv2_esmc  0.769704   1.0
16      chemberta2_esmc  0.764432   2.0
14          molgen_esmc  0.760403   3.0
7       unimolv2_prott5  0.760116   4.0
10    chemberta2_prott5  0.757208   5.0
13        unimolv2_esm2  0.756856   6.0
37        smitrans_esmc  0.756657   7.0
31        unimolv1_esmc  0.754538   8.0
33       unimolv2_esm1b  0.752927   9.0
28      chemberta2_esm2  0.752246  10.0
18            ecfp_esmc  0.752055  11.0
27     chemberta2_esm1b  0.749531  12.0
38          molgen_esm2  0.747543  13.0
39        molebert_esmc  0.747537  14.0
4         molgen_prott5  0.747478  15.0
36          ecfp_prott5  0.746848  16.0
9       smitrans_prott5  0.745533  17.0
2       unimolv1_prott5  0.744858  18.0
12        smitrans_esm2  0.743756  19.0
34         molgen_esm1b  0.743625  20.0
25        unimolv1_esm2  0.742260  21.0
42            ecfp_esm2  0.742257  22.0
1       molebert_prott5  0.739717  23.0
41       smitrans_esm1