In [108]:
import pandas as pd
import numpy as np

def write_pic50_txt(smi_act_list, out_file):
    """Dump structure/activity records to ``out_file``, one record per line.

    Each record is read positionally: element 0 is written verbatim and
    element 1 is formatted with four decimal places. The two fields are
    separated by a comma and every line ends with a newline. An existing file
    at ``out_file`` is overwritten.

    Args:
        smi_act_list: Iterable of indexable records ``(structure, activity)``;
            elements beyond the first two are ignored.
        out_file: Path of the text file to create.
    """
    with open(out_file, 'w') as handle:
        for record in smi_act_list:
            structure, activity = record[0], record[1]
            handle.write(f"{structure},{activity:.4f}\n")


In [113]:
binding_df = pd.read_csv('bindingdb_fgfr3.tsv', delimiter='\t')

# Clean the IC50 column: remove embedded spaces and convert to float.
# Casting to str first keeps this working when pandas has already parsed the
# column as numeric (the .str accessor raises on non-string dtypes). Entries
# that are not plain numbers become NaN via errors='coerce'.
ic50_text = binding_df['IC50 (nM)'].astype(str).str.replace(' ', '', regex=False)
binding_df['IC50 (nM)'] = pd.to_numeric(ic50_text, errors='coerce')

# Compute pIC50 = -log10(IC50 * 1e-9); the column header gives the unit as nM.
# Non-positive concentrations are masked to NaN first: log10(0) would yield
# +inf, which a later dropna() does not remove.
ic50_nm = binding_df['IC50 (nM)']
binding_df['pIC50'] = -np.log10(ic50_nm.where(ic50_nm > 0) * 1e-9)

In [114]:
# Keep only the rows that have both a ligand InChI string and a pIC50 value.
required_columns = ['Ligand InChI', 'pIC50']
clean_df = binding_df.dropna(subset=required_columns)

# Pair each ligand string with its pIC50. Despite the variable name, the first
# element of every tuple is taken from the 'Ligand InChI' column.
smiles_pic50_pairs = [
    (inchi, pic50)
    for inchi, pic50 in zip(clean_df['Ligand InChI'], clean_df['pIC50'])
]

In [112]:
# Mean pIC50 over the rows kept in clean_df.
# NOTE(review): this cell's execution count (112) is lower than that of the
# cells that build binding_df / clean_df above (113, 114), so the value shown
# below was presumably computed from an earlier clean_df — re-run or remove.
clean_df['pIC50'].mean()

7.860693321581272

In [115]:
# Mean pIC50 over the rows kept in clean_df.
clean_df['pIC50'].mean()

7.5970772585257444

In [97]:
from rdkit import Chem

# Convert every ligand InChI to an RDKit SMILES string, keeping its pIC50.
# The list is rebuilt from clean_df rather than rewritten in place so that
# re-running this cell gives the same result (an in-place rewrite would feed
# the already-converted SMILES back into MolFromInchi on a second run).
converted_pairs = []
failed_inchis = []
for inchi, pic50 in zip(clean_df['Ligand InChI'], clean_df['pIC50']):
    mol = Chem.MolFromInchi(inchi)
    if mol is None:
        # MolFromInchi returns None when it cannot parse the string; passing
        # None on to MolToSmiles would raise, so record the entry and skip it.
        failed_inchis.append(inchi)
        continue
    converted_pairs.append((Chem.MolToSmiles(mol), pic50))

if failed_inchis:
    print(f"Skipped {len(failed_inchis)} InChI string(s) RDKit could not parse")

smiles_pic50_pairs = converted_pairs

In [98]:
# Write the (structure, activity) pairs to a text file whose name carries the
# number of records, then display that count.
# NOTE(review): the file name is labelled "fgfr1_ki", while the first load
# cell in this notebook reads 'bindingdb_fgfr3.tsv' and derives values from
# the IC50 column — confirm the label matches the data actually in
# smiles_pic50_pairs when this cell runs.
num_smiles = len(smiles_pic50_pairs)
out_path = f'fgfr1_ki_{num_smiles}.txt'
write_pic50_txt(smiles_pic50_pairs, out_path)
num_smiles

192

In [101]:
# Inspect the raw 'Ki (nM)' column. The displayed dtype is object, i.e. the
# column has not been parsed as a numeric column.
binding_df['Ki (nM)']

0         1.6
1         1.9
2        7.90
3        7.90
4        7.92
        ...  
4940      NaN
4941      NaN
4942      NaN
4943      NaN
4944      NaN
Name: Ki (nM), Length: 4945, dtype: object

In [104]:
import re
def extract_number(text):
    """Return the first unsigned number found in ``text`` as a float.

    The input is converted with ``str()`` first, so non-string values (floats,
    ``None``, NaN) are accepted. Decimal fractions and exponents are part of
    the match, so ``"1.6"`` yields ``1.6`` and ``"1.5e3"`` yields ``1500.0``.
    Anything before the first digit (for example a ``>`` or ``<`` qualifier)
    is skipped, and a leading minus sign is not treated as part of the number.

    Args:
        text: Value to search; any object with a string representation.

    Returns:
        The parsed float, or ``None`` when ``text`` contains no digits.
    """
    # Integer part with optional fraction ("12", "12.", "12.5") or a bare
    # fraction (".5"), followed by an optional exponent. Matching only r'\d+'
    # would truncate "0.25" to 0.0.
    match = re.search(r'(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?', str(text))
    return float(match.group()) if match else None

binding_df = pd.read_csv('bindingdb_fgfr1.tsv', delimiter='\t')

# Parse the 'Ki (nM)' column into floats; entries in which extract_number
# finds no number come back as None and end up as NaN.
ki_nm = pd.to_numeric(binding_df['Ki (nM)'].apply(extract_number))
binding_df['Ki (nM)'] = ki_nm

# Compute -log10(Ki * 1e-9); the column header gives the unit as nM. The
# result is stored under 'pIC50' even though it is derived from Ki, because
# that is the column name the following cells read.
# Non-positive values are masked to NaN first: log10(0) would yield +inf,
# which a later dropna() does not remove.
binding_df['pIC50'] = -np.log10(ki_nm.where(ki_nm > 0) * 1e-9)

In [105]:
# Drop every row that lacks either the ligand InChI string or the pIC50 value.
required_columns = ['Ligand InChI', 'pIC50']
clean_df = binding_df.dropna(subset=required_columns)

# Build (ligand string, pIC50) tuples. Despite the variable name, the first
# element of every tuple is taken from the 'Ligand InChI' column.
smiles_pic50_pairs = [
    (inchi, pic50)
    for inchi, pic50 in zip(clean_df['Ligand InChI'], clean_df['pIC50'])
]

In [107]:
# Number of rows left in clean_df, i.e. rows that have both a ligand InChI
# and a pIC50 value.
len(clean_df)

196