In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import joblib
import faiss
from rdkit import Chem
from rdkit.Chem import AllChem
import os

In [3]:
def smiles_to_ecfp(smiles_list, radius=2, n_bits=2048):
    """Convert SMILES strings to a dense ECFP (Morgan) bit-fingerprint matrix.

    Args:
        smiles_list: Iterable of SMILES strings.
        radius: Morgan fingerprint radius passed to RDKit.
        n_bits: Length of each folded bit vector.

    Returns:
        A ``float32`` NumPy array of shape ``(len(smiles_list), n_bits)``.
        Row ``i`` holds the fingerprint of the i-th SMILES. A SMILES that
        RDKit cannot parse yields an all-zero row, so row order always
        matches the input. Empty input yields an array of shape
        ``(0, n_bits)``.

    Warns:
        UserWarning: once per call, if at least one SMILES could not be
            parsed, reporting how many rows were left as zeros.
    """
    import warnings

    # Materialize once so generators are accepted and the length is known.
    smiles_list = list(smiles_list)

    # Preallocate the result instead of collecting per-molecule arrays and
    # stacking them: this avoids holding two full copies of the matrix and
    # makes the empty-input case well defined.
    ecfp_features = np.zeros((len(smiles_list), n_bits), dtype=np.float32)

    n_invalid = 0
    for row, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Unparseable SMILES: keep the zero row so indices stay aligned.
            n_invalid += 1
            continue
        # NOTE(review): newer RDKit releases may flag this call as deprecated
        # in favor of rdFingerprintGenerator - confirm against the installed
        # version before switching.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        ecfp_features[row] = np.array(fp, dtype=np.float32)

    if n_invalid:
        warnings.warn(
            f"{n_invalid} of {len(smiles_list)} SMILES could not be parsed by "
            "RDKit; their fingerprint rows are all zeros.",
            stacklevel=2,
        )

    return ecfp_features


In [5]:
# Load the filtered training table from Parquet.
df = pd.read_parquet(path="../data/raw_filter_20/train_filter_2.parquet")

# The first column is taken as the SMILES strings; turn each one into a
# 2048-bit ECFP row.
smiles_list = df.iloc[:, 0].to_list()
X = smiles_to_ecfp(smiles_list=smiles_list, n_bits=2048)


In [6]:
# Sanity check: show the first five SMILES strings currently held in smiles_list.
print(smiles_list[:5])

['CC(=O)O[C@@H]1[C@H](OC(C)=O)[C@@H](CN2C(=O)c3ccccc3C2=O)O[C@H]1n1cnc2c(N)nc(Cl)nc21', 'CC(C)(F)C[C@H](N[C@@H](c1ccc(-c2ccc(S(C)(=O)=O)cc2)cc1)C(F)(F)F)C(=O)NC1CCCN(S(=O)(=O)c2ccccn2)CC1=O', 'CSS/C(CCOC(=O)[C@@H](N)CCCCN)=C(/C)N(C=O)CCCCCCCCCCCCN(C=O)/C(C)=C(/CCOC(=O)[C@@H](N)CCCCN)SSC.Cl.Cl', 'CCCNC(=O)Nc1cc2[nH]nc(OCCOC)c2cn1', 'COc1ccc2[nH]cc(CCNC(=O)c3ccc(OC(F)(F)C(F)F)cc3)c2c1']


In [None]:
# Persist the fingerprint matrix X as a Parquet table (one column per bit
# position, row index not written).
pd.DataFrame(data=X).to_parquet(path='X_train_f_ECFP.parquet', index=False)

In [9]:
# Load the filtered test table from Parquet. This rebinds df, smiles_list and
# X, which the following cells rely on.
df = pd.read_parquet(path="../data/raw_filter_20/test_filter_2.parquet")

# The first column is taken as the SMILES strings; turn each one into a
# 2048-bit ECFP row.
smiles_list = df.iloc[:, 0].to_list()
X = smiles_to_ecfp(smiles_list=smiles_list, n_bits=2048)

In [10]:
# Sanity check: show the first five SMILES strings currently held in smiles_list.
print(smiles_list[:5])

['CCC(C)[C@@H](NC(=O)[C@@H](NC(=O)CCCCCCCCCCCCCCC(=O)N[C@H](CC(=O)N[C@@H](Cc1ccccc1)C(=O)O)C(N)=O)C(C)O)C(=O)N[C@@H](Cc1ccccc1)C(N)=O', 'CC(C)Oc1ccccc1C1C(C(=O)C(C)C)C(=O)C(=O)N1c1ccc(N2CCCC2)cc1', 'CC(=O)O[C@H]1C2=C(C)C(=O)O[C@@]2(O)C[C@@]2(C)[C@@H]3C[C@@H]3[C@](C)(O)[C@@H]12', 'COc1cc2ccc(C(O)(c3cnco3)C(C)C)cc2cc1OC', 'COc1ccc(-c2ccc(Cl)c(C(=O)NCCc3ccccc3Cl)c2)nn1']


In [11]:
# Persist the fingerprint matrix X as a Parquet table (one column per bit
# position, row index not written).
pd.DataFrame(data=X).to_parquet(path='X_test_f_ECFP.parquet', index=False)

In [12]:
# Export only the 'smiles' column of df to CSV, without the row index.
df['smiles'].to_csv(path_or_buf='test_f_smiles.csv', index=False)

In [20]:
# Read the filtered training table into its own name, df0, which the
# column-splitting cell below works from.
df0 = pd.read_parquet(path="../data/raw_filter_20/train_filter_2.parquet")

## Split the training dataset by column and save each split as Parquet

In [None]:
# Split the wide training table column-wise into `num_splits` Parquet shards.
# Every shard keeps the first column of df0 plus one contiguous slice of the
# remaining (target) columns.
first_col = df0.columns[0]
target_cols = df0.columns[1:]

num_splits = 6
cols_per_split = len(target_cols) // num_splits
remainder = len(target_cols) % num_splits

# Relative to the notebook, like every other data path used here, so the
# notebook is not tied to one user's home directory.
output_dir = "../data/processed/filter"
# Writing fails if the target directory is missing, so create it up front.
os.makedirs(output_dir, exist_ok=True)

for i in range(num_splits):
    start_idx = i * cols_per_split
    end_idx = start_idx + cols_per_split
    if i == num_splits - 1:
        # The last shard also absorbs the columns left over by the integer
        # division, so no target column is dropped.
        end_idx += remainder

    current_targets = target_cols[start_idx:end_idx]
    sub_df = df0[[first_col] + list(current_targets)]

    output_file = os.path.join(output_dir, f"train_split_{i+1}.parquet")
    sub_df.to_parquet(output_file)
    print(f"保存子数据集 {i+1} 到 {output_file}，列数: {sub_df.shape[1]}")

print("\n数据集拆分完成！")

加载数据集...
保存子数据集 1 到 /home/jovyan/proj-liujing/MolE_Evaluation_Project/data/processed/filter/train_split_1.parquet，列数: 518
保存子数据集 2 到 /home/jovyan/proj-liujing/MolE_Evaluation_Project/data/processed/filter/train_split_2.parquet，列数: 518
保存子数据集 3 到 /home/jovyan/proj-liujing/MolE_Evaluation_Project/data/processed/filter/train_split_3.parquet，列数: 518
保存子数据集 4 到 /home/jovyan/proj-liujing/MolE_Evaluation_Project/data/processed/filter/train_split_4.parquet，列数: 518
保存子数据集 5 到 /home/jovyan/proj-liujing/MolE_Evaluation_Project/data/processed/filter/train_split_5.parquet，列数: 518
保存子数据集 6 到 /home/jovyan/proj-liujing/MolE_Evaluation_Project/data/processed/filter/train_split_6.parquet，列数: 521

数据集拆分完成！


In [22]:
# Read the second shard back from disk and preview its first five rows as a
# spot check of the split.
df3 = pd.read_parquet(path='../data/processed/filter/train_split_2.parquet')
df3.head(n=5)

Unnamed: 0,smiles,517,518,519,520,521,522,523,524,525,...,1024,1025,1026,1027,1028,1029,1030,1031,1032,1033
0,CC(=O)O[C@@H]1[C@H](OC(C)=O)[C@@H](CN2C(=O)c3c...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CC(C)(F)C[C@H](N[C@@H](c1ccc(-c2ccc(S(C)(=O)=O...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CSS/C(CCOC(=O)[C@@H](N)CCCCN)=C(/C)N(C=O)CCCCC...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CCCNC(=O)Nc1cc2[nH]nc(OCCOC)c2cn1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,COc1ccc2[nH]cc(CCNC(=O)c3ccc(OC(F)(F)C(F)F)cc3...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# (rows, columns) of the shard that was read back.
df3.shape

(577840, 518)

In [24]:
# (rows, columns) of the unsplit training table, for comparison with the shard.
df0.shape

(577840, 3106)

In [3]:
# Load the six KNN prediction tables (knn1 ... knn6), in numeric order, into
# result1 ... result6.
result1, result2, result3, result4, result5, result6 = (
    pd.read_parquet(
        f"../data/results/predictions_KNN_STL/all_predictions_knn{split_no}.parquet"
    )
    for split_no in range(1, 7)
)

In [None]:

# Merge the six per-split prediction tables back into one wide table.
dfs = [result1, result2, result3, result4, result5, result6]

# Column-wise concatenation is only meaningful if every table lists the same
# SMILES in the same order, so verify each one against the first table.
for i in range(1, len(dfs)):
    assert dfs[i]['smiles'].equals(dfs[0]['smiles']), f"DataFrame {i+1} 的 SMILES 与第一个不匹配"

# Keep a single copy of the SMILES column, then append every table's other
# columns. The SMILES column is removed by name, matching the name-based check
# above, so the result does not depend on 'smiles' being the first column.
result = pd.concat(
    [dfs[0][['smiles']]] + [df.drop(columns='smiles') for df in dfs],
    axis=1,
)

result.to_parquet("combined_dataset.parquet", index=False)
print("DONE", result.shape)