# In [None]:
import numpy as np
import pandas as pd
import joblib 

from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.Chem import rdFingerprintGenerator

import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.MolStandardize import rdMolStandardize


from joblib import dump

from rdkit import RDLogger

# Raise the RDKit logger threshold to ERROR so that warning and info
# messages are not emitted.
lg = RDLogger.logger()
lg.setLevel(RDLogger.ERROR)


def standardize_mol(mol):
    """Return the cleaned-up main fragment of ``mol``.

    The molecule is first passed through ``rdMolStandardize.Cleanup``; the
    result is then handed to ``LargestFragmentChooser.choose`` so that only
    the largest fragment is kept.

    Args:
        mol: RDKit molecule to standardize.

    Returns:
        The molecule returned by ``LargestFragmentChooser.choose``.
    """
    cleaned = rdMolStandardize.Cleanup(mol)
    return rdMolStandardize.LargestFragmentChooser().choose(cleaned)

# RDKit MACCS fingerprints are 167 bits long: keys 1-166 plus an unused bit 0.
# The all-zero fallback must use the same length as a real fingerprint,
# otherwise stacking the per-molecule vectors produces ragged rows.
_MACCS_N_BITS = 167


def smiles_to_fp(smiles,
                 nBits=1024,
                 radius=2,
                 use_MACCS=False,
                 standardize=True):
    """Convert a SMILES string into a fingerprint bit vector.

    Args:
        smiles: SMILES string. Non-string values (e.g. NaN coming from a
            pandas column with missing entries) are treated like an
            unparsable SMILES.
        nBits: Size of the Morgan fingerprint.
        radius: Radius passed to the Morgan fingerprint generator.
        use_MACCS: When True, the MACCS keys are appended after the Morgan
            bits.
        standardize: When True, the molecule goes through
            ``standardize_mol`` before fingerprinting.

    Returns:
        A 1-D ``np.uint8`` array of length ``nBits`` (Morgan only) or
        ``nBits + 167`` (Morgan followed by MACCS). If the SMILES cannot be
        parsed, an all-zero array of that same length is returned.
    """
    # Missing/non-string entries take the same fallback as invalid SMILES
    # instead of raising inside RDKit.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        fallback_size = nBits + (_MACCS_N_BITS if use_MACCS else 0)
        return np.zeros(fallback_size, dtype=np.uint8)

    # Standardization
    if standardize:
        mol = standardize_mol(mol)

    # Morgan fingerprint
    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)
    fp_morgan = fpgen.GetFingerprint(mol)
    arr_morgan = np.zeros((nBits,), dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(fp_morgan, arr_morgan)

    if not use_MACCS:
        # Morgan bits only
        return arr_morgan

    # MACCS keys, appended after the Morgan bits
    maccs_fp = MACCSkeys.GenMACCSKeys(mol)
    arr_maccs = np.zeros((maccs_fp.GetNumBits(),), dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(maccs_fp, arr_maccs)

    return np.concatenate([arr_morgan, arr_maccs])



# Morgan fingerprint size, defined before the featurization call so the
# call and any later code share a single value.
nBits = 1024

# Test set; the SMILES strings are read from the "smiles" column
df_test = pd.read_csv("../data/smiles_test.csv")

# One fingerprint row per SMILES (Morgan bits followed by MACCS keys),
# cast to float32
X_test = np.array(
    [
        smiles_to_fp(s,
                     nBits=nBits,
                     radius=2,
                     use_MACCS=True,
                     standardize=True)
        for s in df_test["smiles"]
    ],
    dtype=np.float32,
)


# Run each per-task random-forest model on the test fingerprints.
# Tasks are numbered 1..11; the same ids name the output columns.
task_ids = range(1, 12)
task_probs = []

for i in task_ids:
    model_path = f"best_models/rf_task{i}.joblib"
    try:
        rf_model = joblib.load(model_path)
    except FileNotFoundError:
        # No saved model for this task: keep its column, filled with NaN
        print(f"Model {model_path} not found. NaN filler.")
        probs_i = np.full(len(X_test), np.nan, dtype=np.float32)
        task_probs.append(probs_i)
    else:
        # Keep the second predict_proba column
        # (presumably the positive class -- confirm against the trained models)
        y_proba = rf_model.predict_proba(X_test)[:, 1]
        task_probs.append(y_proba)

# Assemble the output table: one row per test molecule, one column per task
task_names = [f"task{i}" for i in task_ids]
probs_matrix = np.column_stack(task_probs)  # (N, 11)
df_preds = pd.DataFrame(probs_matrix, columns=task_names)

# Leading unnamed column holding the row number
df_preds.insert(0, '', range(len(df_preds)))

# Write the predictions to disk
df_preds.to_csv("predictions_rf.csv", index=False)
print("Prediction saved in predictions_rf.csv")
