In [None]:
import os

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import AllChem, DataStructs
from rdkit.Chem import rdFingerprintGenerator

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, roc_auc_score
from sklearn.utils import shuffle

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from joblib import dump
from tqdm import tqdm

# Restrict RDKit's logger to the ERROR level.
lg = RDLogger.logger()
lg.setLevel(RDLogger.ERROR)

# Load the training table; the label columns are named task1 .. task11.
df = pd.read_csv("../data/data_train.csv")
task_cols = ["task" + str(idx) for idx in range(1, 12)]

# Recode the labels: -1 -> 0, 1 -> 1 and 0 -> NaN.
# Series.map also turns any value that is not a key of the mapping into NaN.
label_recoding = {-1: 0, 1: 1, 0: np.nan}
for col_name in task_cols:
    df[col_name] = df[col_name].map(label_recoding)

import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.MolStandardize import rdMolStandardize

def standardize_mol(mol):
    """Return a cleaned-up copy of *mol* reduced to its largest fragment.

    The molecule is first passed through ``rdMolStandardize.Cleanup`` and the
    result is then handed to a ``LargestFragmentChooser``, whose choice is
    returned.
    """
    cleaned = rdMolStandardize.Cleanup(mol)
    fragment_chooser = rdMolStandardize.LargestFragmentChooser()
    return fragment_chooser.choose(cleaned)

def smiles_to_fp(smiles, 
                 nBits=1024, 
                 radius=2, 
                 use_MACCS=False, 
                 standardize=True):
    """Convert a SMILES string into a binary fingerprint vector.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule. Anything that is not a string
        (e.g. a NaN read from a CSV) is treated like an unparseable SMILES.
    nBits : int
        Length of the Morgan fingerprint.
    radius : int
        Radius passed to the Morgan fingerprint generator.
    use_MACCS : bool
        When true, the MACCS key fingerprint is appended to the Morgan bits.
    standardize : bool
        When true, the molecule goes through ``standardize_mol`` before
        fingerprinting.

    Returns
    -------
    numpy.ndarray
        1-D ``uint8`` array of length ``nBits`` (Morgan only) or
        ``nBits + 167`` (Morgan followed by MACCS). If the SMILES cannot be
        parsed, an all-zero array of that same length is returned so every row
        of a dataset has the same width.
    """
    # RDKit's MACCS fingerprint has 167 bits, not 166: bit 0 is an unused
    # placeholder so that bit indices line up with the 1-based key numbers.
    # The zero vector below must use the same width as GenMACCSKeys output,
    # otherwise stacking the rows into a 2-D array fails.
    maccs_bits = 167

    # Converting SMILES to Mol (non-string input never reaches RDKit)
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # if fails, zero array with the same length as a successful result
        maccs_size = maccs_bits if use_MACCS else 0
        return np.zeros(nBits + maccs_size, dtype=np.uint8)

    # Standardization
    if standardize:
        mol = standardize_mol(mol)

    # Generating fingerprint Morgan
    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)
    fp_morgan = fpgen.GetFingerprint(mol)
    arr_morgan = np.zeros((nBits,), dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(fp_morgan, arr_morgan)

    if not use_MACCS:
        # Return only Morgan
        return arr_morgan

    # Otherwise, MACCS and concat
    maccs_fp = MACCSkeys.GenMACCSKeys(mol)
    arr_maccs = np.zeros((maccs_fp.GetNumBits(),), dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(maccs_fp, arr_maccs)

    # Concat
    combined_fp = np.concatenate([arr_morgan, arr_maccs])
    return combined_fp



# Build the feature matrix: one fingerprint row per SMILES string
# (1024 Morgan bits at radius 2 followed by the MACCS keys), stored as float32.
fingerprint_options = dict(nBits=1024, radius=2, use_MACCS=True, standardize=True)
fingerprint_rows = [smiles_to_fp(smi, **fingerprint_options) for smi in df["smiles"]]
X = np.array(fingerprint_rows, dtype=np.float32)

# Hold out 10% of the rows; the data frame and the feature matrix are split
# together so that their rows stay aligned.
df_train, df_test, X_train, X_test = train_test_split(
    df,
    X,
    test_size=0.1,
    random_state=42,
)

# Candidate tree depths: no limit (None) plus a hand-picked set of values
# between 5 and 35.
depth_choices = [None, *range(5, 12), 13, 15, *range(17, 34), 35]

# Hyper-parameter search space for the random forest.
search_spaces = {
    "n_estimators": Integer(5, 2500),                           # integer range
    "criterion": Categorical(["gini", "entropy"]),              # discrete choices
    "max_depth": Categorical(depth_choices),                    # discrete choices
    "min_samples_split": Integer(2, 15),                        # integer range
    "min_samples_leaf": Integer(1, 10),                         # integer range
    "max_features": Categorical(["sqrt", "log2", 0.5, 0.7]),    # discrete choices
    "class_weight": Categorical([None, "balanced"]),            # discrete choices
}

# Hold-out AUC of every task that could be evaluated (NaN when the test
# labels of a task contain a single class).
auc_list = []

# Number of search iterations per task; identical for every task, so it is
# set once outside the loop.
n_iterations = 2000

# joblib's dump() does not create missing parent directories, so make sure
# the output folder exists before the long-running searches start.
os.makedirs("best_models", exist_ok=True)

for i, task in enumerate(task_cols, start=1):
    print(f"\n*** Tuning per {task} ***")
    # Keep only the training rows that carry a label for this task
    train_mask = ~df_train[task].isna()
    X_train_f = X_train[train_mask]
    y_train_f = df_train.loc[train_mask, task].values.astype(int)

    if len(y_train_f) == 0:
        print(f"No train sample found for {task}, skip.")
        continue

    # ROC-AUC cross-validation and predict_proba(...)[:, 1] both need the two
    # classes to be present in the training labels.
    if len(np.unique(y_train_f)) < 2:
        print(f"Only one class in train samples for {task}, skip.")
        continue

    # Define RF
    rf = RandomForestClassifier(random_state=0)

    # Hyper-parameter search scored by cross-validated ROC-AUC
    grid_search = BayesSearchCV(
        estimator=rf,
        search_spaces=search_spaces,
        n_iter=n_iterations,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
        random_state=42
    )

    grid_search.fit(X_train_f, y_train_f)

    best_model = grid_search.best_estimator_
    print(f"Best params for {task}: {grid_search.best_params_}")

    # Saving Model
    dump(best_model, f"best_models/rf_task{i}.joblib")
    print(f"Salvato: rf_task{i}.joblib")

    # Evaluation over test: again only the rows labelled for this task
    test_mask = ~df_test[task].isna()
    X_test_f = X_test[test_mask]
    y_test_f = df_test.loc[test_mask, task].values.astype(int)

    if len(y_test_f) == 0:
        print(f"No test sample found for {task}, skip.")
        continue

    # Predicting prob (continuous output), thresholded at 0.5 for precision
    y_proba = best_model.predict_proba(X_test_f)[:, 1]
    y_pred = (y_proba >= 0.5).astype(int)

    precision = precision_score(y_test_f, y_pred, zero_division=0)
    # AUC is only computed when both classes occur in the test labels
    if len(np.unique(y_test_f)) == 2:
        auc_val = roc_auc_score(y_test_f, y_proba)
    else:
        auc_val = np.nan

    print(f"{task} -> Precision={precision:.3f}, AUC={auc_val if not np.isnan(auc_val) else 'N/A'}")
    auc_list.append(auc_val)

# Average over the tasks for which an AUC could be computed
valid_aucs = [x for x in auc_list if not np.isnan(x)]
mean_auc = np.mean(valid_aucs) if valid_aucs else np.nan
print(f"\nAUC (AVG) over task: {mean_auc:.3f}")
