In [4]:
import numpy as np
import pandas as pd
import joblib  # per caricare i modelli Random Forest salvati con dump(...)

from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.Chem import rdFingerprintGenerator

######################################
# 1) Funzione smiles_to_fp
######################################
def smiles_to_fp(smiles, radius=2, nBits=1024):
    """Convert a SMILES string into a Morgan fingerprint bit vector.

    Args:
        smiles: SMILES string describing the molecule. Values that are not
            strings (``None``, the float ``NaN`` pandas produces for empty
            CSV cells, ...) are handled like unparsable SMILES.
        radius: Morgan radius forwarded to the fingerprint generator.
        nBits: Length of the fingerprint (forwarded as ``fpSize``).

    Returns:
        A ``numpy.ndarray`` of shape ``(nBits,)`` and dtype ``uint8``.
        The vector is all zeros when no molecule can be built from
        ``smiles``.
    """
    arr = np.zeros((nBits,), dtype=np.uint8)

    # Missing values must follow the same "invalid molecule" path as bad
    # SMILES; handing a non-string to RDKit would abort the whole batch.
    if not isinstance(smiles, str):
        return arr

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit could not parse the string: keep the all-zero placeholder.
        return arr

    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)
    fp = fpgen.GetFingerprint(mol)
    # Copies the fingerprint bits into ``arr`` in place.
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

######################################
# 2) Load the test file and compute fingerprints
######################################
# Fingerprint length; it has to match the one used at training time.
nBits = 1024
# The CSV is read for its "smiles" column.
df_test = pd.read_csv("data/smiles_test.csv")
# One fingerprint vector per row, kept in file order.
X_list = [
    smiles_to_fp(smi, nBits=nBits)
    for smi in df_test["smiles"]
]
# Feature matrix handed to the models, stored as float32.
X_test = np.asarray(X_list, dtype=np.float32)

######################################
# 3) Load the models and predict
######################################
task_probs = []  # one 1-D probability vector per task, in task order

for i in range(1, 12):
    model_path = f"best_models/rf_task{i}.joblib"
    try:
        rf_model = joblib.load(model_path)
    except FileNotFoundError:
        print(f"Modello {model_path} non trovato. Inserisco colonna di NaN.")
        # A missing model yields a NaN column so the output keeps one
        # column per task (0.5 would be an alternative placeholder).
        probs_i = np.full(len(X_test), np.nan, dtype=np.float32)
        task_probs.append(probs_i)
        continue

    # predict_proba returns one column per entry of ``classes_``, in that
    # order, so the positive class is not guaranteed to sit in column 1
    # (a model fitted on a single-class task has only one column).
    proba = rf_model.predict_proba(X_test)
    classes = list(getattr(rf_model, "classes_", ()))
    if 1 in classes:
        y_proba = proba[:, classes.index(1)]  # shape (N,)
    elif proba.shape[1] > 1:
        # Labels are not encoded as 1: keep the original choice of the
        # second column.
        y_proba = proba[:, 1]
    else:
        # The model only knows a non-positive class, so the probability
        # of class 1 is zero for every sample.
        print(f"Modello {model_path} senza classe positiva: probabilità impostate a 0.")
        y_proba = np.zeros(len(X_test), dtype=proba.dtype)
    task_probs.append(y_proba)

######################################
# 4) Assemble the output DataFrame
######################################
# Column labels task1 .. task11, one per model.
task_names = [f"task{i}" for i in range(1, 12)]
# Place the per-task vectors side by side -> shape (N, 11).
probs_matrix = np.column_stack(task_probs)
df_preds = pd.DataFrame(data=probs_matrix, columns=task_names)

######################################
# 5) Prepend an index column with an empty name
######################################
# The first column holds the row numbers 0..N-1 under an empty header.
df_preds.insert(loc=0, column='', value=range(df_preds.shape[0]))

######################################
# 6) Write the CSV
######################################
# The pandas index is not written; the inserted column already carries it.
df_preds.to_csv("predictions_rf.csv", index=False)
print("Predizioni salvate in predictions_rf.csv")




Predizioni salvate in predictions_rf.csv
