In [1]:
import pandas as pd
import numpy as np
import faiss
import joblib
from rdkit import Chem
from rdkit.Chem import AllChem


In [5]:
# **Step 1: Convert SMILES to ECFP fingerprints**
def smiles_to_ecfp(smiles_list, radius=2, n_bits=2048):
    """Convert a list of SMILES strings to ECFP fingerprints."""
    ecfp_features = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
            ecfp_features.append(list(fp))
        else:
            ecfp_features.append([0] * n_bits)  # Fill with zeros if parsing fails
    return np.array(ecfp_features)

In [3]:
# **Step 2: Load Parquet data**
def load_dataset(train_file):
    """Load a single Parquet file and convert it into ECFP features."""
    print(f"Loading {train_file}...")
    # Read the entire Parquet file
    df = pd.read_parquet(train_file)
    
    # **Extract X (SMILES only)**
    smiles_list = df.iloc[:, 0].tolist()  # First column is 'smiles'
    X = smiles_to_ecfp(smiles_list, n_bits=2048)  # Generate ECFP2048 as the only features
    
    # **Extract y (target values)**
    y = df.iloc[:, 1:].values.astype(np.float32)  # All remaining columns are targets
    
    return X, y


In [4]:
# **Step 3: Train FAISS KNN index**
def train_faiss_knn(X_train, y_train, index_file="faiss_knn.index", y_file="faiss_y_train.pkl"):
    """Create and store a FAISS index."""
    d = X_train.shape[1]  # Get feature dimension
    index = faiss.IndexFlatL2(d)  # Use L2 (Euclidean) distance
    
    print("Training FAISS index...")
    index.add(X_train)  # Add training data to the FAISS index
    
    # **Save index**
    faiss.write_index(index, index_file)
    joblib.dump(y_train, y_file)  # Save target values
    print(f"FAISS index saved to {index_file} and {y_file}")


In [8]:
# **Step 4: Use KNN for prediction**
def knn_predict(X_test, k=5, index_file="../data/models/KNN_Filter/KNN_F_MTL/faiss_knn.index", y_file="../data/models/KNN_Filter/KNN_F_MTL/faiss_y_train.pkl"):
    """Perform prediction using FAISS KNN."""
    index = faiss.read_index(index_file)  # Load FAISS index
    y_train = joblib.load(y_file)  # Load stored target values
    
    distances, indices = index.search(X_test, k)  # Find the k nearest neighbors
    y_pred = np.mean(y_train[indices], axis=1)  # Compute the mean of neighbors' target values
    
    return y_pred



In [9]:
# **Step 5: Run**
if __name__ == "__main__":
    # Specify the single Parquet file
    train_file = "../data/raw_filter_20/train_filter_2.parquet"  # Update this path

    # **Loading dataset**
    X_train, y_train = load_dataset(train_file)

    # Ensure X_train is float32 and has the correct shape
    X_train = X_train.astype(np.float32)
    X_train = X_train.reshape(-1, X_train.shape[1])

    # Train FAISS KNN index
    train_faiss_knn(X_train, y_train)

Loading ../data/raw_filter_20/train_filter_2.parquet...
Training FAISS index...
FAISS index saved to faiss_knn.index and faiss_y_train.pkl


In [3]:
def load_test_set(test_file):
    """Load a Parquet file and convert SMILES to ECFP features."""
    print(f"Loading test set from {test_file}...")
    # Read the Parquet file
    chunk = pd.read_parquet(test_file)
    
    # Extract SMILES and generate ECFP features
    smiles_list = chunk.iloc[:, 0].tolist()  # First column is 'smiles'
    X_test = smiles_to_ecfp(smiles_list, n_bits=2048)  # 2048 dimensions
    
    # Ensure float32 for FAISS compatibility
    X_test = X_test.astype(np.float32)
    
    print("X_test shape:", X_test.shape)  # Debugging info
    return X_test

In [6]:
# **加载测试数据集 x_test**
x_test = load_test_set("../data/raw_filter_20/test_filter_2.parquet")  

Loading test set from ../data/raw_filter_20/test_filter_2.parquet...
X_test shape: (72230, 2048)


In [18]:
y_pred = knn_predict(x_test, k=11)

In [20]:
x_test = x_test.astype(np.float32)

In [21]:
df_pred = pd.DataFrame(y_pred)
df_pred.columns = [f"{i}" for i in range(df_pred.shape[1])]
print(df_pred.head())  


     0    1    2    3         4    5    6    7    8    9  ...  3095  3096  \
0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
1  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
2  0.0  0.0  0.0  0.0  0.880909  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
3  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
4  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   

   3097  3098  3099  3100  3101  3102  3103  3104  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 3105 columns]


In [22]:
df_pred.shape

(72230, 3105)

In [13]:
smiles = pd.read_parquet("../data/raw_filter_20/test_filter_2.parquet")


In [14]:
smiles1 = smiles["smiles"]

In [23]:
df_pred.insert(0, "smiles", smiles1)  

In [24]:
df_pred.head()

Unnamed: 0,smiles,0,1,2,3,4,5,6,7,8,...,3095,3096,3097,3098,3099,3100,3101,3102,3103,3104
0,CCC(C)[C@@H](NC(=O)[C@@H](NC(=O)CCCCCCCCCCCCCC...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CC(C)Oc1ccccc1C1C(C(=O)C(C)C)C(=O)C(=O)N1c1ccc...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CC(=O)O[C@H]1C2=C(C)C(=O)O[C@@]2(O)C[C@@]2(C)[...,0.0,0.0,0.0,0.0,0.880909,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,COc1cc2ccc(C(O)(c3cnco3)C(C)C)cc2cc1OC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,COc1ccc(-c2ccc(Cl)c(C(=O)NCCc3ccccc3Cl)c2)nn1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
df_pred.to_parquet('predictions_KNN11_filter.parquet', index=False)