In [None]:
import joblib
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsRegressor

from rdkit import Chem
from rdkit.Chem import AllChem

# !pip install faiss-gpu
import faiss

In [None]:

def smiles_to_ecfp(smiles_list, radius=2, n_bits=2048):
    """Convert SMILES strings to Morgan (ECFP-style) bit-vector fingerprints.

    Args:
        smiles_list: Iterable of SMILES strings. Entries that are not strings
            (for example ``None`` or the ``NaN`` pandas produces for an empty
            CSV cell) are handled like unparsable SMILES.
        radius: Morgan fingerprint radius passed to RDKit.
        n_bits: Length of each fingerprint bit vector.

    Returns:
        A 2-D integer array with one row per input entry and ``n_bits``
        columns of 0/1 values. Entries that could not be parsed produce an
        all-zero row so that row order stays aligned with the input. An empty
        input yields an array of shape ``(0, n_bits)``.
    """
    ecfp_features = []
    for smi in smiles_list:
        # Only hand real strings to RDKit; anything else is treated as invalid.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is not None:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
            ecfp_features.append(list(fp))
        else:
            # Zero-vector placeholder keeps the output aligned with the input.
            ecfp_features.append([0] * n_bits)

    if not ecfp_features:
        # np.array([]) would be 1-D with shape (0,), which cannot be
        # column-stacked with other 2-D feature blocks.
        return np.zeros((0, n_bits), dtype=int)
    return np.array(ecfp_features)

# Step 1: Load data & ECFP4 generation

In [None]:

def load_dataset(train_files, n_targets=6143):
    """Load training CSV chunks into a feature matrix and a target matrix.

    For every file, the first column is read as SMILES and converted to
    2048-bit fingerprints, the last ``n_targets`` columns are taken as the
    targets, and every column in between is appended to the fingerprint bits
    as an extra numeric feature.

    Args:
        train_files: Iterable of CSV paths, read in the given order.
        n_targets: Number of trailing columns that hold the targets.

    Returns:
        ``(X, y)`` as float32 arrays with the rows of all files stacked in
        file order. float32 is returned directly because that is the dtype
        FAISS requires; the values (0/1 bits plus float32 features) are
        identical to the former float64 result.

    Raises:
        ValueError: If ``train_files`` is empty or ``n_targets`` is less
            than 1 (a zero-width negative slice would select every column).
    """
    train_files = list(train_files)
    if not train_files:
        raise ValueError("train_files must contain at least one CSV path")
    if n_targets < 1:
        raise ValueError("n_targets must be a positive integer")

    n_bits = 2048
    X_list, y_list = [], []

    for f in train_files:
        print(f"Loading {f}...")
        chunk = pd.read_csv(f, encoding="utf-8-sig")

        # Fingerprints from the SMILES column; the reshape keeps the block
        # 2-D even when the file has no data rows.
        smiles_list = chunk.iloc[:, 0].tolist()
        ecfp_features = np.asarray(
            smiles_to_ecfp(smiles_list, n_bits=n_bits), dtype=np.float32
        ).reshape(len(smiles_list), n_bits)

        # Columns between the SMILES column and the targets.
        other_features = chunk.iloc[:, 1:-n_targets].values.astype(np.float32)

        X_list.append(np.hstack((ecfp_features, other_features)))
        y_list.append(chunk.iloc[:, -n_targets:].values.astype(np.float32))

    return np.vstack(X_list), np.vstack(y_list)

# Step 2: Train FAISS KNN index

In [None]:
def train_faiss_knn(X_train, y_train, index_file="faiss_knn.index", y_file="faiss_y_train.pkl"):
    """Build an exact L2 FAISS index over the training features and save it.

    The index is written to ``index_file`` and the targets are dumped with
    joblib to ``y_file`` so that ``knn_predict`` can look up the targets of
    the retrieved neighbours by row position.

    Args:
        X_train: 2-D array-like of training features, one row per sample.
            It is converted to a C-contiguous float32 array, which is the
            layout FAISS requires.
        y_train: Targets aligned row-for-row with ``X_train``.
        index_file: Output path for the serialized FAISS index.
        y_file: Output path for the joblib dump of ``y_train``.

    Raises:
        ValueError: If ``X_train`` is not 2-D or the number of target rows
            does not match the number of feature rows.
    """
    # Validate before touching FAISS so that misaligned data is never saved.
    X_train = np.ascontiguousarray(X_train, dtype=np.float32)
    if X_train.ndim != 2:
        raise ValueError(f"X_train must be 2-D, got {X_train.ndim} dimension(s)")
    if len(y_train) != X_train.shape[0]:
        raise ValueError(
            f"X_train has {X_train.shape[0]} rows but y_train has {len(y_train)}"
        )

    d = X_train.shape[1]
    index = faiss.IndexFlatL2(d)  # Exact search with L2 (Euclidean) distance

    print("Training FAISS index...")
    index.add(X_train)  # Row i of the index corresponds to y_train[i]

    faiss.write_index(index, index_file)
    joblib.dump(y_train, y_file)
    print("FAISS index saved.")

# Step 3: Use KNN for prediction

In [None]:
def knn_predict(X_test, k=5, index_file="faiss_knn.index", y_file="faiss_y_train.pkl"):
    """Predict targets as the unweighted mean over the k nearest neighbours.

    Args:
        X_test: 2-D array-like of query features with the same number of
            columns as the indexed training features. It is converted to a
            C-contiguous float32 array, which is the layout FAISS requires.
        k: Number of neighbours to average. If the index holds fewer than
            ``k`` vectors, all of them are used instead.
        index_file: Path of the FAISS index written by ``train_faiss_knn``.
        y_file: Path of the joblib dump of the training targets.

    Returns:
        Array with one row per query holding the mean of the neighbours'
        target rows.

    Raises:
        ValueError: If ``k`` is less than 1, ``X_test`` is not 2-D, the index
            is empty, the feature dimension does not match the index, or the
            stored targets do not match the index size.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    X_test = np.ascontiguousarray(X_test, dtype=np.float32)
    if X_test.ndim != 2:
        raise ValueError(f"X_test must be 2-D, got {X_test.ndim} dimension(s)")

    index = faiss.read_index(index_file)
    # NOTE: joblib.load unpickles the file and can execute arbitrary code;
    # only load target files that this notebook produced itself.
    y_train = np.asarray(joblib.load(y_file))

    if index.ntotal == 0:
        raise ValueError("FAISS index is empty")
    if X_test.shape[1] != index.d:
        raise ValueError(
            f"X_test has {X_test.shape[1]} features but the index expects {index.d}"
        )
    if len(y_train) != index.ntotal:
        raise ValueError(
            f"index holds {index.ntotal} vectors but y_train has {len(y_train)} rows"
        )

    # FAISS pads missing neighbours with index -1, which numpy would read as
    # "last training row"; never ask for more neighbours than exist.
    n_neighbors = min(k, index.ntotal)
    _, indices = index.search(X_test, n_neighbors)

    return np.mean(y_train[indices], axis=1)

# Step 4: Run the training pipeline

In [None]:
if __name__ == "__main__":
    # The training data is split over eight files, train_chunk1.csv through
    # train_chunk8.csv, which are loaded in numeric order.
    train_files = [
        f"../uncoverted_dataset_csv/train_chunks_csv/train_chunk{chunk_no}.csv"
        for chunk_no in range(1, 9)
    ]

    X_train, y_train = load_dataset(train_files)

Loading ../uncoverted_dataset_csv/train_chunks_csv/train_chunk1.csv...
Loading ../uncoverted_dataset_csv/train_chunks_csv/train_chunk2.csv...
Loading ../uncoverted_dataset_csv/train_chunks_csv/train_chunk3.csv...
Loading ../uncoverted_dataset_csv/train_chunks_csv/train_chunk4.csv...
Loading ../uncoverted_dataset_csv/train_chunks_csv/train_chunk5.csv...
Loading ../uncoverted_dataset_csv/train_chunks_csv/train_chunk6.csv...
Loading ../uncoverted_dataset_csv/train_chunks_csv/train_chunk7.csv...
Loading ../uncoverted_dataset_csv/train_chunks_csv/train_chunk8.csv...


In [None]:
# FAISS works on float32 matrices, so cast the features once up front; the
# reshape leaves an already 2-D matrix unchanged.
X_train = X_train.astype(np.float32).reshape(-1, X_train.shape[1])

# In-memory exact L2 (Euclidean) index over the training features.
# NOTE(review): train_faiss_knn builds and saves its own index from the same
# data, so this one looks redundant — confirm before removing.
d = X_train.shape[-1]  # Feature dimension
index = faiss.IndexFlatL2(d)
index.add(X_train)

In [11]:
# Build the index and persist it, plus the targets, under the default paths.
train_faiss_knn(X_train=X_train, y_train=y_train)


Training FAISS index...
FAISS index saved.


# Step 5: Load test set & get predictions

In [None]:
def load_test_set(test_file, n_targets=6143):
    """Load a test CSV and build its feature matrix.

    The first column is read as SMILES and converted to 2048-bit
    fingerprints. Columns between the SMILES column and the last
    ``n_targets`` columns are appended as extra numeric features, mirroring
    the slicing used for the training chunks.

    NOTE(review): this assumes the test CSV either carries the same
    ``n_targets`` trailing columns as the training files or has only the
    SMILES column — confirm against the actual test files.

    Args:
        test_file: Path of the CSV file to read.
        n_targets: Number of trailing columns to exclude from the features.

    Returns:
        float32 feature matrix with one row per CSV row. float32 is returned
        directly because that is the dtype FAISS requires.

    Raises:
        ValueError: If ``n_targets`` is less than 1 (a zero-width negative
            slice would drop every feature column).
    """
    if n_targets < 1:
        raise ValueError("n_targets must be a positive integer")

    n_bits = 2048
    chunk = pd.read_csv(test_file, encoding="utf-8-sig")

    # Fingerprints from the SMILES column; the reshape keeps the block 2-D
    # even when the file has no data rows.
    smiles_list = chunk.iloc[:, 0].tolist()
    ecfp_features = np.asarray(
        smiles_to_ecfp(smiles_list, n_bits=n_bits), dtype=np.float32
    ).reshape(len(smiles_list), n_bits)
    other_features = chunk.iloc[:, 1:-n_targets].values.astype(np.float32)

    # Combine ECFP features with other features
    X_test = np.hstack((ecfp_features, other_features))

    return X_test

In [None]:
# Featurize the first test file.
x_test2 = load_test_set(test_file="test_smiles12.csv")

X_test shape: (19358, 2048)


In [None]:
# FAISS search expects float32 queries.
x_test2 = x_test2.astype(dtype=np.float32)

In [None]:
# Average the targets of the 5 nearest training neighbours per test row.
y_pred2 = knn_predict(X_test=x_test2, k=5)

In [None]:
# Wrap the predictions in a DataFrame whose columns are labelled "0", "1", ...
df_pred2 = pd.DataFrame(y_pred2)
df_pred2.columns = list(map(str, range(df_pred2.shape[1])))
print(df_pred2.head())

     0    1    2    3    4    5    6    7    8    9  ...  6133  6134  6135  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

   6136  6137  6138  6139  6140  6141  6142  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 6143 columns]


In [None]:
# Attach the input SMILES as the leading column of the prediction table.
# Same encoding as load_test_set, so a BOM cannot leak into the header.
smiles2 = pd.read_csv("test_smiles12.csv", encoding="utf-8-sig")
# insert() needs a 1-D value: passing the whole DataFrame only works when the
# CSV has exactly one column. The guard keeps the cell re-runnable, because
# insert() raises if the column already exists.
if "smiles" not in df_pred2.columns:
    df_pred2.insert(0, "smiles", smiles2.iloc[:, 0].to_numpy())

In [30]:
# Preview the first five prediction rows.
print(df_pred2.head(n=5))

                                              smiles    0    1    2    3    4  \
0  CN1CCC(n2cnc(-c3ccc(F)cc3)c2-c2ccnc(Nc3ccncc3)...  0.0  0.0  0.0  0.0  0.0   
1        Cc1cc(C)nc(-n2nc(C)cc2NC(=O)CN2CCC(C)CC2)n1  0.0  0.0  0.0  0.0  0.0   
2     C#CCSC[C@H](NC(=O)c1cc(OC)c(OC)c(OC)c1)C(=O)OC  0.0  0.0  0.0  0.0  0.0   
3  COc1c(NC(=O)/C(=N/O)c2ccc(OCCN3CCOCC3)c3ccccc2...  0.0  0.0  0.0  0.0  0.0   
4               O=C(O)c1ccccc1Nc1cc(F)cc(C(F)(F)F)c1  0.0  0.0  0.0  0.0  0.0   

     5    6    7    8  ...  6133  6134  6135  6136  6137  6138  6139  6140  \
0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
1  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
2  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
3  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
4  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   

   6141  6142  
0   0.0   0.0  
1   0.0   0.

In [None]:
def get_knn_pred(test_file, k=5):
    """Run the saved FAISS KNN model on a test CSV and label the predictions.

    Args:
        test_file: Path of the test CSV; its first column holds the SMILES.
        k: Number of nearest neighbours to average.

    Returns:
        DataFrame whose first column ``"smiles"`` holds the input SMILES,
        followed by one prediction column per target, named ``"0"``, ``"1"``,
        and so on.
    """
    x_test = load_test_set(test_file).astype(np.float32)
    pred = knn_predict(x_test, k=k)

    df_pred = pd.DataFrame(pred)
    df_pred.columns = [str(i) for i in range(df_pred.shape[1])]

    # Read only the SMILES column and insert it as a plain 1-D array.
    # Passing a whole DataFrame to insert() only works for one-column CSVs.
    smiles = pd.read_csv(test_file, encoding="utf-8-sig", usecols=[0]).iloc[:, 0]
    df_pred.insert(0, "smiles", smiles.to_numpy())

    return df_pred

    
    
# Predict for the second test file and write the table without the row index.
df_pred4 = get_knn_pred(test_file="test_smiles22.csv")
df_pred4.to_csv(path_or_buf="test4_pred_KNN.csv", index=False)