In [124]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.Chem import rdFingerprintGenerator
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

n_bits = 2048

# Morgan fingerprint generators keyed by (radius, fpSize). Building a generator
# is independent of the molecule, so it is created once per configuration
# instead of once per SMILES string.
_MORGAN_GENERATORS = {}


def smiles_to_fp(smiles, radius=2, nBits=2048):
    """Convert a SMILES string into a Morgan fingerprint stored in a NumPy array.

    Args:
        smiles: SMILES string describing the molecule.
        radius: Morgan radius handed to the RDKit fingerprint generator.
        nBits: Fingerprint size (``fpSize``) handed to the generator.

    Returns:
        A 1-D ``np.uint8`` array holding the fingerprint. When RDKit cannot
        parse ``smiles`` (``MolFromSmiles`` returns ``None``) an all-zero
        array of length ``nBits`` is returned instead.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparseable SMILES: return zeros with the same integer dtype as a
        # regular fingerprint array.
        return np.zeros(nBits, dtype=np.uint8)

    # Reuse the generator for this (radius, nBits) pair if one already exists.
    cache_key = (radius, nBits)
    fpgen = _MORGAN_GENERATORS.get(cache_key)
    if fpgen is None:
        fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)
        _MORGAN_GENERATORS[cache_key] = fpgen

    # Fingerprint object for this molecule.
    fp = fpgen.GetFingerprint(mol)

    # Copy the fingerprint into a NumPy array.
    arr = np.zeros((nBits,), dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

df = pd.read_csv("data/data_train.csv")

# Label columns are named task1 ... task11.
task_cols = ["task" + str(idx) for idx in range(1, 12)]

# Recode the raw labels: -1 (inactive) -> 0, 1 (active) -> 1, 0 (unknown) -> NaN.
# Series.map turns any value that is not a key of this dict into NaN as well.
label_recode = {-1: 0, 1: 1, 0: np.nan}
df = df.assign(**{col: df[col].map(label_recode) for col in task_cols})

# Discard molecules that have no known label for any task.
df = df.dropna(subset=task_cols, how='all')

# Feature matrix: one fingerprint row per remaining molecule, cast to float32.
X = np.array(
    [smiles_to_fp(smi, nBits=n_bits) for smi in df["smiles"]],
    dtype=np.float32,
)
# Label matrix with one column per task; entries are 0, 1 or NaN.
Y = df[task_cols].to_numpy().astype(np.float32)




In [125]:
# map_dict = {-1: 0, 0: 1, 1: 2}
# df[task_cols] = df[task_cols].replace(map_dict).astype(np.float32)

# Y_mapped = df[task_cols].values  # shape (N, 11), dtype float32

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
split_arrays = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split_arrays

# Report dtype and shape of the training features, then of the training labels.
for train_array in (X_train, Y_train):
    print(train_array.dtype, train_array.shape)

float32 (9224, 2048)
float32 (9224, 11)


In [126]:
import torch
from torch.utils.data import Dataset, DataLoader

class FingerprintDataset(Dataset):
    """In-memory dataset pairing feature rows with their label rows.

    Both inputs are converted to ``float32`` tensors once, at construction
    time, and indexed along their first axis.

    Args:
        X: Array-like of features; the first axis indexes samples.
        Y: Array-like of labels; the first axis indexes samples.

    Raises:
        ValueError: If ``X`` and ``Y`` do not contain the same number of rows.
    """

    def __init__(self, X, Y):
        # as_tensor accepts NumPy arrays, tensors and nested sequences, and
        # avoids a copy when the input is already float32.
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.Y = torch.as_tensor(Y, dtype=torch.float32)

        # Fail early on misaligned inputs instead of raising an IndexError in
        # the middle of an epoch (or silently ignoring surplus label rows).
        if self.X.shape[0] != self.Y.shape[0]:
            raise ValueError(
                f"X and Y must have the same number of rows, "
                f"got {self.X.shape[0]} and {self.Y.shape[0]}"
            )
        self.n_samples = self.X.shape[0]

    def __len__(self):
        """Return the number of samples."""
        return self.n_samples

    def __getitem__(self, idx):
        """Return the ``(features, label_vector)`` pair stored at ``idx``."""
        return self.X[idx], self.Y[idx]

# Dataset over the full (unsplit) data. Kept in case code outside this cell
# refers to `dataset`; it is deliberately not wrapped in a training loader, so
# the held-out rows are never mixed into the training batches.
dataset = FingerprintDataset(X, Y)

# Separate datasets for the train and test partitions.
train_dataset = FingerprintDataset(X_train, Y_train)
test_dataset  = FingerprintDataset(X_test, Y_test)

batch_size = 32
# Shuffle only the training batches; the test loader keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [127]:
import torch
import torch.nn as nn

class MultiTaskBinaryNet(nn.Module):
    """Shared fully connected trunk followed by one single-logit head per task."""

    def __init__(self, input_dim=2048, hidden_dim=256, num_tasks=11, num_layers=2, dropout_prob=0.2):
        """
        Args:
            input_dim (int): size of the input vector (e.g. a 2048-bit fingerprint)
            hidden_dim (int): number of units in every trunk layer
            num_tasks (int): number of tasks, i.e. number of output heads
            num_layers (int): number of dense layers in the shared trunk (default 2);
                the first layer is always created, so values below 1 still yield one
            dropout_prob (float): dropout probability applied after each ReLU (default 0.2)
        """
        super().__init__()

        # Input width of each trunk block: the first consumes the raw input,
        # every following one consumes the previous hidden activation.
        block_inputs = [input_dim] + [hidden_dim] * max(num_layers - 1, 0)

        # Each block is Linear -> ReLU -> Dropout.
        trunk = []
        for in_features in block_inputs:
            trunk += [
                nn.Linear(in_features, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout_prob),
            ]
        self.shared_net = nn.Sequential(*trunk)

        # One output head per task, each producing a single logit.
        task_heads = []
        for _ in range(num_tasks):
            task_heads.append(nn.Linear(hidden_dim, 1))
        self.heads = nn.ModuleList(task_heads)

    def forward(self, x):
        """
        x: tensor of shape (batch_size, input_dim)
        Returns a list of length num_tasks; every element has shape
        (batch_size, 1) and holds the raw logits of one task.
        """
        # Shared representation computed once and fed to every head.
        hidden = self.shared_net(x)
        return [task_head(hidden) for task_head in self.heads]


In [128]:
import torch.nn as nn

# The loss below uses reduction='none' so that the mean over all tasks can be computed manually.

def multi_task_bce_loss_masked(outputs, targets):
    """
    Binary cross-entropy over all tasks, skipping labels that are NaN.

    outputs: list of per-task logit tensors, each of shape (batch_size, 1)
    targets: tensor of shape (batch_size, num_tasks) holding 0, 1 or NaN

    Returns a scalar tensor: the mean of the element-wise losses over every
    (sample, task) pair whose label is defined. If the batch contains no
    defined label at all, a zero tensor with requires_grad=True is returned.
    """
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    per_task_losses = []
    for task_idx, task_output in enumerate(outputs):
        logits = task_output.squeeze(1)        # (batch_size,)
        labels = targets[:, task_idx]          # (batch_size,)

        # True where this task's label is known.
        known = torch.isnan(labels).logical_not()
        if not known.any():
            continue

        # Element-wise loss restricted to the samples with a defined label.
        per_task_losses.append(criterion(logits[known], labels[known]))

    if not per_task_losses:
        # Extreme case: not a single label is defined in this batch.
        return torch.tensor(0.0, requires_grad=True, device=outputs[0].device)

    # Pool the losses of all tasks and average over every defined label.
    return torch.cat(per_task_losses, dim=0).mean()




In [129]:
import torch.optim as optim

# Prefer Apple's Metal (MPS) backend when it is available; otherwise fall back
# to CUDA and finally to the CPU, so the notebook also runs on machines
# without MPS support instead of failing when the model is moved to the device.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

model = MultiTaskBinaryNet(
    input_dim=n_bits,
    hidden_dim=256,
    num_tasks=11,
    num_layers=3
).to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-2)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()  # enable dropout
    running_loss = 0.0
    sample_count = 0

    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)  # features of the batch
        batch_y = batch_y.to(device)  # labels of the batch (0, 1 or NaN)

        optimizer.zero_grad()

        outputs = model(batch_x)  # list with one (batch_size, 1) logit tensor per task
        loss = multi_task_bce_loss_masked(outputs, batch_y)

        loss.backward()
        optimizer.step()

        # The loss is a mean over the labelled entries of the batch; weighting
        # it by the batch size yields a sample-weighted average over the epoch.
        batch_size_ = batch_x.size(0)
        running_loss += loss.item() * batch_size_
        sample_count += batch_size_

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches.
    epoch_loss = running_loss / max(sample_count, 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")


Device: mps
Epoch 1/10, Loss: 0.3697
Epoch 2/10, Loss: 0.3519
Epoch 3/10, Loss: 0.3385
Epoch 4/10, Loss: 0.3327
Epoch 5/10, Loss: 0.3377
Epoch 6/10, Loss: 0.3288
Epoch 7/10, Loss: 0.3213
Epoch 8/10, Loss: 0.3308
Epoch 9/10, Loss: 0.3101
Epoch 10/10, Loss: 0.3089


In [130]:
model.eval()  # disable dropout for evaluation
num_tasks = len(model.heads)  # one output head per task
test_loss = 0.0
correct_counts = [0] * num_tasks
total_counts = [0] * num_tasks

with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        # outputs = list of (batch_size, 1) tensors holding raw logits, one per task
        outputs = model(batch_x)

        # 1) Masked BCE, averaged over the labelled entries of the batch.
        #    Multiplying by the batch size accumulates "loss * n_samples",
        #    which is divided by the dataset size after the loop.
        loss = multi_task_bce_loss_masked(outputs, batch_y)
        test_loss += loss.item() * batch_x.size(0)

        # 2) Per-task accuracy, counted only where a label is defined.
        for i in range(num_tasks):
            logit_i = outputs[i].squeeze(dim=1)   # (batch_size,)
            target_i = batch_y[:, i]              # (batch_size,)

            # Boolean mask: True where the label is NOT NaN.
            mask = ~torch.isnan(target_i)

            # Probabilities and hard predictions for the labelled samples only.
            prob_i  = torch.sigmoid(logit_i[mask])
            preds_i = (prob_i >= 0.5).long()

            valid_targets = target_i[mask].long()    # 0 or 1

            correct_counts[i] += (preds_i == valid_targets).sum().item()
            total_counts[i]   += valid_targets.numel()

# Sample-weighted average loss over the test set.
test_loss = test_loss / len(test_loader.dataset)
print(f"Test Loss: {test_loss:.4f}")

# Report the accuracy of each task.
for i in range(num_tasks):
    if total_counts[i] == 0:
        # A task may have no labelled sample in the test split; report that
        # explicitly instead of raising ZeroDivisionError.
        print(f"Task {i+1} - Accuracy: n/a (no labeled samples)")
        continue
    acc_i = correct_counts[i] / total_counts[i]
    print(f"Task {i+1} - Accuracy: {acc_i*100:.2f}%")


Test Loss: 0.3403
Task 1 - Accuracy: 91.24%
Task 2 - Accuracy: 46.83%
Task 3 - Accuracy: 78.03%
Task 4 - Accuracy: 60.00%
Task 5 - Accuracy: 85.66%
Task 6 - Accuracy: 93.15%
Task 7 - Accuracy: 95.26%
Task 8 - Accuracy: 82.54%
Task 9 - Accuracy: 76.30%
Task 10 - Accuracy: 96.69%
Task 11 - Accuracy: 98.27%


In [131]:
# Test Loss: 0.5958
# Task 1 - Accuracy: 89.18%
# Task 2 - Accuracy: 56.35%
# Task 3 - Accuracy: 85.23%
# Task 4 - Accuracy: 56.57%
# Task 5 - Accuracy: 86.07%
# Task 6 - Accuracy: 93.03%
# Task 7 - Accuracy: 90.05%
# Task 8 - Accuracy: 79.37%
# Task 9 - Accuracy: 81.52%
# Task 10 - Accuracy: 91.74%
# Task 11 - Accuracy: 98.27%


# ## ---------
# Test Loss: 0.6776
# Task 1 - Accuracy: 88.66%
# Task 2 - Accuracy: 53.97%
# Task 3 - Accuracy: 85.98%
# Task 4 - Accuracy: 64.00%
# Task 5 - Accuracy: 84.69%
# Task 6 - Accuracy: 93.03%
# Task 7 - Accuracy: 95.26%
# Task 8 - Accuracy: 82.54%
# Task 9 - Accuracy: 76.78%
# Task 10 - Accuracy: 95.04%
# Task 11 - Accuracy: 98.27%