In [8]:
import os
import numpy as np
import math
from collections import defaultdict
import torch
import esm
from esm import Alphabet, FastaBatchedDataset, ProteinBertModel, pretrained, MSATransformer
import pathlib
import pandas as pd
from scipy.stats import pearsonr

In [9]:
def read_processed_sequences(a3m_file):
    """读取a3m文件并处理小写字母，返回与query等长的序列列表"""
    sequences = []
    with open(a3m_file, 'r') as f:
        current_seq = []
        for line in f:
            if line.startswith('>'):
                if current_seq:
                    # 合并多行序列并处理小写字母
                    full_seq = ''.join(current_seq)
                    processed = ''.join([c for c in full_seq if not c.islower()])
                    sequences.append(processed)
                    current_seq = []
            else:
                current_seq.append(line.strip())
        # 处理最后一个序列
        if current_seq:
            full_seq = ''.join(current_seq)
            processed = ''.join([c for c in full_seq if not c.islower()])
            sequences.append(processed)
    
    if not sequences:
        raise ValueError("未读取到有效序列")
    
    # 获取query长度并过滤异常序列
    query_len = len(sequences[0])
    # 使用列表推导式过滤（保留第一个query序列）
    filtered_sequences = [sequences[0]] + [seq for seq in sequences[1:] 
                                         if len(seq) == query_len]
    
    # 输出过滤统计信息
    original_count = len(sequences)
    filtered_count = len(filtered_sequences)
    if filtered_count < original_count:
        print(f"[提示] 已过滤 {original_count - filtered_count} 条长度不一致的序列，"
              f"保留 {filtered_count} 条有效序列")
    
    return filtered_sequences

def calculate_msa_entropy(sequences):
    """计算每个位点的熵值"""
    # 定义标准氨基酸集合
    amino_acids = {'A','R','N','D','C','Q','E','G','H','I',
                   'L','K','M','F','P','S','T','W','Y','V'}
    
    seq_len = len(sequences[0])
    entropy_scores = np.zeros(seq_len)
    
    for pos in range(seq_len):
        counts = defaultdict(int)
        valid_count = 0
        
        # 统计该位点氨基酸出现次数
        for seq in sequences:
            aa = seq[pos]
            if aa in amino_acids:
                counts[aa] += 1
                valid_count += 1
        
        # 计算熵值
        entropy = 0.0
        if valid_count > 0:
            for cnt in counts.values():
                p = cnt / valid_count
                entropy -= p * math.log2(p) if p > 0 else 0
        
        entropy_scores[pos] = entropy
    
    return entropy_scores

## 对比ris和msa

In [10]:
device = 'cuda:3'
model_location='/home2/kangboming/kangboming/workspace/PIC_revise/model/ESM/esm2_t33_650M_UR50D.pt'

### 加载ESM模型
model, alphabet = esm.pretrained.load_model_and_alphabet(model_location)
batch_converter = alphabet.get_batch_converter()

### 获得概率值：GPU版
def get_logits(seq, model,alphabet,batch_converter,device,format=None):
    AAorder=['R','K','H','E','D','N','Q','T','S','C','G','A','V','L','I','M','P','Y','F','W']
    data = [("_", seq), ("_", seq), ("_", seq), ("_", seq)]
    batch_labels, batch_strs, batch_tokens = batch_converter(data)
    batch_tokens = batch_tokens.to(device)  # Move batch_tokens to GPU
    model = model.to(device)  # Move model to GPU
    with torch.no_grad():
        logits = torch.softmax(model(batch_tokens, repr_layers=[33], return_contacts=False)["logits"], dim=-1).cpu().numpy()
        # return logits
    if format == 'pandas':
        WTlogits = pd.DataFrame(logits[0][1:-1, :], columns=alphabet.all_toks, index=list(seq)).T.iloc[4:24].loc[AAorder]
        WTlogits.columns = [j.split('.')[0] + '_' + str(i + 1) for i, j in enumerate(WTlogits.columns)]
        return WTlogits
    if format == 'array':
        WTlogits = pd.DataFrame(logits[0][1:-1, :], columns=alphabet.all_toks, index=list(seq)).T.iloc[4:24].loc[AAorder]
        WTlogits_array = WTlogits.values
        return WTlogits_array
    else:
        return logits[0][1:-1, :]

def get_logits_batch(seq_lst, model,alphabet,batch_converter,device,format=None):
    AAorder=['R','K','H','E','D','N','Q','T','S','C','G','A','V','L','I','M','P','Y','F','W']
    data = [("_", seq),]
    print(data)
    batch_labels, batch_strs, batch_tokens = batch_converter(data)
    batch_tokens = batch_tokens.to(device)  # Move batch_tokens to GPU
    model = model.to(device)  # Move model to GPU
    with torch.no_grad():
        logits = torch.softmax(model(batch_tokens, repr_layers=[33], return_contacts=False)["logits"], dim=-1).cpu().numpy()
    if format == 'pandas':
        WTlogits = pd.DataFrame(logits[0][1:-1, :], columns=alphabet.all_toks, index=list(seq)).T.iloc[4:24].loc[AAorder]
        WTlogits.columns = [j.split('.')[0] + '_' + str(i + 1) for i, j in enumerate(WTlogits.columns)]
        return WTlogits
    if format == 'array':
        WTlogits = pd.DataFrame(logits[0][1:-1, :], columns=alphabet.all_toks, index=list(seq)).T.iloc[4:24].loc[AAorder]
        WTlogits_array = WTlogits.values
        return WTlogits_array
    else:
        return logits[0][1:-1, :]

def calculate_esm_entropy(prob_matrix):

    # Ensure probabilities are non-zero to avoid log(0)
    prob_matrix = np.clip(prob_matrix, 1e-10, 1.0)
    
    # Calculate entropy for each position
    entropy = -np.sum(prob_matrix * np.log2(prob_matrix), axis=-1)
    return entropy

In [11]:
filePath = '/home/liuyu/codes/ifold/my_works/testset/alignment_test'
ali_lst = os.listdir(filePath)
ali_lst

['5eec_A',
 '6v0v_A',
 '5whq_A',
 '6khh_A',
 '3tg4_A',
 '5lg4_A',
 '6tdu_AG',
 '3vkw_A',
 '3f9p_C',
 '3vop_A',
 '5drn_A',
 '5c1t_A',
 '3oq9_H',
 '4uo4_A',
 '7cww_A',
 '7anm_aa',
 '2enw_A',
 '6bta_A',
 '2aqx_A',
 '4ntd_A',
 '3kt5_A',
 '2j6z_A',
 '6lqk_A',
 '5svb_A',
 '1faz_A',
 '4f6o_A',
 '6cfw_K',
 '3ls2_A',
 '6utl_A',
 '4bl9_A',
 '4bv4_R',
 '6zk9_d',
 '1j9w_A',
 '3kkj_A',
 '2l82_A',
 '1uzm_A',
 '3u5o_I',
 '3ilr_A',
 '3ru9_A',
 '4bxl_C',
 '3myo_A',
 '3opq_A',
 '6blb_A',
 '6ud1_A',
 '2iy0_A',
 '5jr1_H',
 '1plf_A',
 '7jr5_A',
 '4a47_A',
 '1jnv_Y',
 '6nmv_H',
 '1f0z_A',
 '2uy2_A',
 '5zch_A',
 '5ksb_I',
 '1fmh_B',
 '6y9e_A',
 '6zcf_A',
 '4qxg_H',
 '6cnb_Q',
 '1mt1_B',
 '6niq_A',
 '7cdc_C',
 '4r98_A',
 '2dax_A',
 '5n0b_A',
 '2y6s_C',
 '4yeu_A',
 '5ej1_A',
 '2i9n_A',
 '2qrx_A',
 '7amc_A',
 '6nov_A',
 '4raf_A',
 '2fca_A',
 '6dzm_A',
 '5me5_B',
 '3nax_A',
 '7nfx_t',
 '6n4d_A',
 '6e0b_A',
 '3jcu_H',
 '6bac_A',
 '3nxl_A',
 '4gfh_A',
 '4iqz_A',
 '6jlu_19',
 '7e5o_H',
 '3wsy_A',
 '2wpd_J',
 '7rlg_

In [None]:
import warnings
warnings.filterwarnings("ignore")

prot_name_lst = []
msa_ent_lst = []
esm_ent_lst = []
corr_lst = []
p_value_lst = []
for item in ali_lst:
    try:
        a3m_file = f'/home/liuyu/codes/ifold/my_works/testset/alignment_test/{item}/uniref90_hits.a3m'
        seq_msa = read_processed_sequences(a3m_file)
        msa_ent = calculate_msa_entropy(seq_msa)
        # print(msa_ent)
        esm_logit = get_logits(seq_msa[0], model, alphabet, batch_converter, device, format='array')
        esm_ent = calculate_esm_entropy(esm_logit.T)
        # print(esm_ent)
        corr, p_value = pearsonr(msa_ent, esm_ent)
        # print(corr)
        prot_name_lst.append(item)
        msa_ent_lst.append(msa_ent)
        esm_ent_lst.append(esm_ent)
        corr_lst.append(corr)
        p_value_lst.append(p_value)
    except:
        pass

In [22]:
res = pd.DataFrame({'prot_id':prot_name_lst, 'msa_entropy':msa_ent_lst, 'esm_entropy':esm_ent_lst, 'pearson_r':corr_lst, 'p_value':p_value_lst})
res

Unnamed: 0,prot_id,msa_entropy,esm_entropy,pearson_r,p_value
0,5eec_A,"[0.0, 0.537722259170694, 2.130368940614964, 3....","[0.9056432, 1.8913814, 3.2183812, 2.944435, 3....",0.456331,3.058298e-15
1,6v0v_A,"[0.9853665169344864, 0.011886566145789652, 1.9...","[0.3131475, 2.4706426, 1.4563936, 0.43207082, ...",-0.130439,2.716230e-04
2,5whq_A,"[0.0, 2.2359263506290326, 1.3605273114437082, ...","[0.24322398, 3.2649784, 1.509316, 1.6037657, 0...",0.606435,2.587472e-78
3,6khh_A,"[0.2920391072640456, 0.00854345017056622, 3.10...","[0.008397541, 0.0015327749, 1.0893201, 0.05473...",0.537745,1.044832e-15
4,3tg4_A,"[0.09140162014739274, 1.8119019864112724, 2.53...","[0.036959767, 2.5919728, 1.6678122, 1.2576972,...",0.435331,1.888823e-21
...,...,...,...,...,...
4804,3p99_A,"[0.0, 1.6534900296125528, 2.4685937569816416, ...","[1.7447037, 1.7684298, 1.0514115, 0.029275995,...",0.471970,1.652484e-26
4805,2f6b_A,"[0.0, 1.7858620215404153, 1.7564363637635219, ...","[1.5921417, 2.8477256, 2.767546, 1.9646392, 2....",0.445144,2.028807e-11
4806,1pjq_A,"[0.051402317293002264, 2.4462806823879477, 2.9...","[0.0009103721, 0.26597425, 1.3680799, 0.193920...",0.610368,5.432145e-48
4807,1ebg_A,"[0.0, 0.35123831041832787, 3.360250618578943, ...","[0.029140597, 1.2347147, 1.6438084, 0.49960944...",0.586729,1.108318e-41


In [23]:
df2 = pd.read_csv('/home/liuyu/codes/ifold/my_works/rmsd_compare_res.csv',usecols=['prot_id','prot_lenth','MSA_count'])
df2

Unnamed: 0,prot_id,prot_lenth,MSA_count
0,5fdu_1y,10,1
1,7arh_V,10,2
2,1lvb_C,10,1
3,5gic_C,10,1
4,3rwi_C,10,1
...,...,...,...
3594,3cmu_A,304,17711
3595,6qw6_5A,27,4500
3596,6g2i_A,166,12114
3597,6rw8_A,1319,12306


In [24]:
df3 = pd.merge(df2,res).dropna()
df3

Unnamed: 0,prot_id,prot_lenth,MSA_count,msa_entropy,esm_entropy,pearson_r,p_value
22,1mdi_B,13,20,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.9182958340544896, ...","[1.5165472, 2.3614123, 2.842379, 2.2770755, 3....",-0.097147,7.522124e-01
26,1mvu_P,13,1,"[0.9798687566511528, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2.4798949, 1.8946745, 2.0342958, 1.3810273, 1...",0.521566,6.754653e-02
37,5v21_B,15,420,"[0.0, 0.0, 0.39314106803181825, 0.082617092512...","[1.9311681, 2.7289474, 1.7434655, 2.4443135, 1...",0.094354,7.380212e-01
50,3plu_C,18,466,"[0.0, 0.0, 0.0, 1.117278648903896, 1.102382950...","[2.4115262, 1.0662894, 0.46427396, 0.7137516, ...",-0.025769,9.166011e-01
57,6zqs_B,14,57,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7219280948873...","[2.41609, 1.8512884, 2.5281353, 2.6452045, 2.4...",0.298147,2.016804e-01
...,...,...,...,...,...,...,...
3594,3cmu_A,304,17711,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.811...","[1.9520988, 2.2837095, 3.6516654, 3.3118792, 0...",0.093066,2.439455e-05
3595,6qw6_5A,27,4500,"[0.8271595195340526, 2.140888035329084, 1.7690...","[0.16087885, 1.2384948, 1.3056793, 1.0283935, ...",0.073952,3.736236e-04
3596,6g2i_A,166,12114,"[0.0, 1.2357256642352388, 0.5242344512056485, ...","[1.6928067, 2.2653956, 2.6691034, 2.4984996, 1...",0.349298,2.772611e-68
3597,6rw8_A,1319,12306,"[0.0, 1.9532888733249165, 2.7397012717189235, ...","[3.0134244, 2.6460645, 2.175682, 2.6725206, 1....",0.336401,8.479259e-68


In [25]:
df4 = df3[(df3['prot_lenth']>50) & (df3['MSA_count']>1000)]
df4

Unnamed: 0,prot_id,prot_lenth,MSA_count,msa_entropy,esm_entropy,pearson_r,p_value
434,4v49_BF,52,1025,"[0.004604558453396438, 2.1313722582155985, 1.0...","[0.0070921807, 1.1334897, 0.16219354, 0.875433...",0.599165,2.693729e-06
435,1le8_A,53,1031,"[0.0, 1.4689955935892813, 2.57355111297219, 0....","[1.4378254, 1.5161511, 1.3347821, 0.95238936, ...",0.605589,1.552497e-06
437,1b2o_A,54,1043,"[0.14089074268693202, 2.7596801824407375, 2.31...","[0.0031979096, 0.26130733, 0.21698931, 0.14259...",0.571877,6.265527e-06
439,3esw_B,55,1050,"[0.0, 0.0, 0.8798196705627167, 0.1623261801753...","[2.0481043, 3.5872295, 0.91768605, 0.7643086, ...",0.265077,5.048213e-02
440,2ysa_A,55,1017,"[0.0, 2.4485148623399016, 2.494665468423249, 2...","[1.1628559, 0.3024863, 0.14397605, 0.59923226,...",0.195206,1.532297e-01
...,...,...,...,...,...,...,...
3593,7cu3_A,65,9582,"[0.0, 0.02341565053334974, 1.7140612787576388,...","[0.2502526, 1.5579014, 1.8002597, 0.7549762, 1...",0.490787,5.058310e-106
3594,3cmu_A,304,17711,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.811...","[1.9520988, 2.2837095, 3.6516654, 3.3118792, 0...",0.093066,2.439455e-05
3596,6g2i_A,166,12114,"[0.0, 1.2357256642352388, 0.5242344512056485, ...","[1.6928067, 2.2653956, 2.6691034, 2.4984996, 1...",0.349298,2.772611e-68
3597,6rw8_A,1319,12306,"[0.0, 1.9532888733249165, 2.7397012717189235, ...","[3.0134244, 2.6460645, 2.175682, 2.6725206, 1....",0.336401,8.479259e-68


In [27]:
import pickle
with open('msa_esm_ent.pickle', 'wb') as f:
    pickle.dump(df4, f)

In [28]:
with open('msa_esm_ent.pickle', 'rb') as f:
    data = pickle.load(f)
data

Unnamed: 0,prot_id,prot_lenth,MSA_count,msa_entropy,esm_entropy,pearson_r,p_value
434,4v49_BF,52,1025,"[0.004604558453396438, 2.1313722582155985, 1.0...","[0.0070921807, 1.1334897, 0.16219354, 0.875433...",0.599165,2.693729e-06
435,1le8_A,53,1031,"[0.0, 1.4689955935892813, 2.57355111297219, 0....","[1.4378254, 1.5161511, 1.3347821, 0.95238936, ...",0.605589,1.552497e-06
437,1b2o_A,54,1043,"[0.14089074268693202, 2.7596801824407375, 2.31...","[0.0031979096, 0.26130733, 0.21698931, 0.14259...",0.571877,6.265527e-06
439,3esw_B,55,1050,"[0.0, 0.0, 0.8798196705627167, 0.1623261801753...","[2.0481043, 3.5872295, 0.91768605, 0.7643086, ...",0.265077,5.048213e-02
440,2ysa_A,55,1017,"[0.0, 2.4485148623399016, 2.494665468423249, 2...","[1.1628559, 0.3024863, 0.14397605, 0.59923226,...",0.195206,1.532297e-01
...,...,...,...,...,...,...,...
3593,7cu3_A,65,9582,"[0.0, 0.02341565053334974, 1.7140612787576388,...","[0.2502526, 1.5579014, 1.8002597, 0.7549762, 1...",0.490787,5.058310e-106
3594,3cmu_A,304,17711,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.811...","[1.9520988, 2.2837095, 3.6516654, 3.3118792, 0...",0.093066,2.439455e-05
3596,6g2i_A,166,12114,"[0.0, 1.2357256642352388, 0.5242344512056485, ...","[1.6928067, 2.2653956, 2.6691034, 2.4984996, 1...",0.349298,2.772611e-68
3597,6rw8_A,1319,12306,"[0.0, 1.9532888733249165, 2.7397012717189235, ...","[3.0134244, 2.6460645, 2.175682, 2.6725206, 1....",0.336401,8.479259e-68


In [29]:
data_new = data.drop(['msa_entropy', 'esm_entropy'], axis=1)
data_new

Unnamed: 0,prot_id,prot_lenth,MSA_count,pearson_r,p_value
434,4v49_BF,52,1025,0.599165,2.693729e-06
435,1le8_A,53,1031,0.605589,1.552497e-06
437,1b2o_A,54,1043,0.571877,6.265527e-06
439,3esw_B,55,1050,0.265077,5.048213e-02
440,2ysa_A,55,1017,0.195206,1.532297e-01
...,...,...,...,...,...
3593,7cu3_A,65,9582,0.490787,5.058310e-106
3594,3cmu_A,304,17711,0.093066,2.439455e-05
3596,6g2i_A,166,12114,0.349298,2.772611e-68
3597,6rw8_A,1319,12306,0.336401,8.479259e-68


In [30]:
data_new.to_csv('msa_esm_ent_noarray.csv', index=False)

In [33]:
from scipy.stats import pearsonr
total_corr, total_p_value = pearsonr(np.concatenate(tuple(data['msa_entropy'])), np.concatenate(tuple(data['esm_entropy'])))
print('Total pearsonR: ', total_corr, total_p_value, len(np.concatenate(tuple(data['msa_entropy']))))

Total pearsonR:  0.37619867851975686 0.0 877683


In [15]:
total_corr, total_p_value = pearsonr(np.concatenate(tuple(msa_ent_lst)), np.concatenate(tuple(esm_ent_lst)))
print('Total pearsonR: ', total_corr)

Total pearsonR:  0.25088975584863205


In [8]:
res.to_csv('msa_esm_entropy.csv',index=False)

In [10]:
prot_len = [len(item) for item in msa_ent_lst]
prot_len

[269,
 775,
 768,
 191,
 433,
 196,
 306,
 446,
 467,
 64,
 216,
 391,
 100,
 328,
 368,
 74,
 177,
 165,
 289,
 340,
 203,
 111,
 204,
 776,
 122,
 350,
 173,
 280,
 216,
 756,
 446,
 380,
 260,
 336,
 162,
 247,
 22,
 370,
 351,
 22,
 333,
 163,
 355,
 136,
 226,
 236,
 72,
 490,
 63,
 135,
 223,
 66,
 294,
 328,
 11,
 33,
 307,
 103,
 220,
 251,
 113,
 267,
 9,
 271,
 152,
 1335,
 217,
 312,
 728,
 33,
 235,
 129,
 226,
 367,
 213,
 311,
 90,
 311,
 136,
 397,
 401,
 73,
 379,
 475,
 1178,
 250,
 166,
 244,
 678,
 76,
 821,
 221,
 65,
 182,
 291,
 335,
 138,
 339,
 271,
 258,
 550,
 139,
 381,
 329,
 165,
 309,
 122,
 558,
 149,
 367,
 59,
 391,
 402,
 485,
 20,
 47,
 221,
 265,
 587,
 409,
 760,
 70,
 81,
 76,
 5,
 124,
 266,
 100,
 274,
 4,
 169,
 144,
 139,
 513,
 208,
 46,
 146,
 199,
 95,
 43,
 230,
 167,
 434,
 66,
 316,
 180,
 235,
 165,
 230,
 353,
 211,
 48,
 237,
 404,
 113,
 385,
 5,
 253,
 200,
 135,
 118,
 500,
 255,
 161,
 332,
 971,
 13,
 266,
 478,
 501,
 850,
 562,


In [12]:
res_noarray = pd.DataFrame({'prot_id':prot_name_lst, 'prot_lenth':prot_len, 'pearson_r':corr_lst, 'p_value':p_value_lst})
res_noarray

Unnamed: 0,prot_id,prot_lenth,pearson_r,p_value
0,5eec_A,269,0.456331,3.058298e-15
1,6v0v_A,775,-0.130439,2.716230e-04
2,5whq_A,768,0.606435,2.587472e-78
3,6khh_A,191,0.537745,1.044832e-15
4,3tg4_A,433,0.435331,1.888823e-21
...,...,...,...,...
4804,3p99_A,453,0.471970,1.652484e-26
4805,2f6b_A,206,0.445144,2.028807e-11
4806,1pjq_A,457,0.610368,5.432145e-48
4807,1ebg_A,436,0.586729,1.108318e-41


In [13]:
res_noarray.to_csv('msa_esm_entropy_noarray.csv',index=False)

## analize

In [1]:
import pandas as pd
import numpy as np

In [2]:
df1 = pd.read_csv('msa_esm_entropy.csv')
df1

Unnamed: 0,prot_id,msa_entropy,esm_entropy,pearson_r,p_value
0,5eec_A,[0. 0.53772226 2.13036894 3.35130217 2...,[9.0564322e-01 1.8913814e+00 3.2183812e+00 2.9...,0.456331,3.058298e-15
1,6v0v_A,[0.98536652 0.01188657 1.98049885 1.57532527 1...,[3.13147485e-01 2.47064257e+00 1.45639360e+00 ...,-0.130439,2.716230e-04
2,5whq_A,[0.00000000e+00 2.23592635e+00 1.36052731e+00 ...,[2.43223980e-01 3.26497841e+00 1.50931597e+00 ...,0.606435,2.587472e-78
3,6khh_A,[2.92039107e-01 8.54345017e-03 3.10317401e+00 ...,[8.39754101e-03 1.53277488e-03 1.08932006e+00 ...,0.537745,1.044832e-15
4,3tg4_A,[0.09140162 1.81190199 2.53601814 1.9275623 1...,[3.69597673e-02 2.59197283e+00 1.66781223e+00 ...,0.435331,1.888823e-21
...,...,...,...,...,...
4804,3p99_A,[0. 1.65349003 2.46859376 0.3862838 0...,[1.74470365e+00 1.76842976e+00 1.05141151e+00 ...,0.471970,1.652484e-26
4805,2f6b_A,[0. 1.78586202 1.75643636 1.16421814 2...,[1.59214175e+00 2.84772563e+00 2.76754594e+00 ...,0.445144,2.028807e-11
4806,1pjq_A,[0.05140232 2.44628068 2.90355678 1.38316541 0...,[9.10372124e-04 2.65974253e-01 1.36807990e+00 ...,0.610368,5.432145e-48
4807,1ebg_A,[0. 0.35123831 3.36025062 3.09623727 1...,[2.91405972e-02 1.23471475e+00 1.64380836e+00 ...,0.586729,1.108318e-41


In [3]:
df2 = pd.read_csv('/home/liuyu/codes/ifold/my_works/rmsd_compare_res.csv',usecols=['prot_id','prot_lenth','MSA_count'])
df2

Unnamed: 0,prot_id,prot_lenth,MSA_count
0,5fdu_1y,10,1
1,7arh_V,10,2
2,1lvb_C,10,1
3,5gic_C,10,1
4,3rwi_C,10,1
...,...,...,...
3594,3cmu_A,304,17711
3595,6qw6_5A,27,4500
3596,6g2i_A,166,12114
3597,6rw8_A,1319,12306


In [4]:
df3 = pd.merge(df2,df1).dropna()
df3

Unnamed: 0,prot_id,prot_lenth,MSA_count,msa_entropy,esm_entropy,pearson_r,p_value
22,1mdi_B,13,20,[0. 0. 0. 0. 0...,[1.5165472 2.3614123 2.842379 2.2770755 3.461...,-0.097147,7.522124e-01
26,1mvu_P,13,1,[0.97986876 0. 0. 0. 0...,[2.4798949 1.8946745 2.0342958 1.3810273 1.188...,0.521566,6.754653e-02
37,5v21_B,15,420,[0. 0. 0.39314107 0.08261709 0...,[1.9311681 2.7289474 1.7434655 2.4443135 1.956...,0.094354,7.380212e-01
50,3plu_C,18,466,[0. 0. 0. 1.11727865 1...,[2.4115262 1.0662894 0.46427396 0.7137516 0...,-0.025769,9.166011e-01
57,6zqs_B,14,57,[0. 0. 0. 0. 0...,[2.41609 1.8512884 2.5281353 2.6452045 2.470...,0.298147,2.016804e-01
...,...,...,...,...,...,...,...
3594,3cmu_A,304,17711,[0. 0. 0. ... 0. 0. 0.],[1.9520988 2.2837095 3.6516654 ... 0.004921...,0.093066,2.439455e-05
3595,6qw6_5A,27,4500,[0.82715952 2.14088804 1.76902941 ... 3.264778...,[0.16087885 1.2384948 1.3056793 ... 1.555124...,0.073952,3.736236e-04
3596,6g2i_A,166,12114,[0. 1.23572566 0.52423445 ... 1.930084...,[1.6928067 2.2653956 2.6691034 ... 2.542458 1...,0.349298,2.772611e-68
3597,6rw8_A,1319,12306,[0. 1.95328887 2.73970127 ... 1.321114...,[3.0134244 2.6460645 2.175682 ... 2.1114886 1...,0.336401,8.479259e-68


In [5]:
df4 = df3[(df3['prot_lenth']>50) & (df3['MSA_count']>1000)]
df4

Unnamed: 0,prot_id,prot_lenth,MSA_count,msa_entropy,esm_entropy,pearson_r,p_value
434,4v49_BF,52,1025,[0.00460456 2.13137226 1.02543201 0.65915615 0...,[0.00709218 1.1334897 0.16219354 0.8754334 0...,0.599165,2.693729e-06
435,1le8_A,53,1031,[0. 1.46899559 2.57355111 0.96351611 0...,[1.4378254 1.5161511 1.3347821 0.95238936 0...,0.605589,1.552497e-06
437,1b2o_A,54,1043,[0.14089074 2.75968018 2.31236514 1.56122529 3...,[3.1979096e-03 2.6130733e-01 2.1698931e-01 1.4...,0.571877,6.265527e-06
439,3esw_B,55,1050,[0. 0. 0.87981967 0.16232618 0...,[2.0481043 3.5872295 0.91768605 0.7643086 1...,0.265077,5.048213e-02
440,2ysa_A,55,1017,[0. 2.44851486 2.49466547 2.3664909 3...,[1.1628559 0.3024863 0.14397605 0.59923226 0...,0.195206,1.532297e-01
...,...,...,...,...,...,...,...
3593,7cu3_A,65,9582,[0. 0.02341565 1.71406128 ... 0.970647...,[0.2502526 1.5579014 1.8002597 ... 1.9835308 1...,0.490787,5.058310e-106
3594,3cmu_A,304,17711,[0. 0. 0. ... 0. 0. 0.],[1.9520988 2.2837095 3.6516654 ... 0.004921...,0.093066,2.439455e-05
3596,6g2i_A,166,12114,[0. 1.23572566 0.52423445 ... 1.930084...,[1.6928067 2.2653956 2.6691034 ... 2.542458 1...,0.349298,2.772611e-68
3597,6rw8_A,1319,12306,[0. 1.95328887 2.73970127 ... 1.321114...,[3.0134244 2.6460645 2.175682 ... 2.1114886 1...,0.336401,8.479259e-68
