In [None]:
import numpy as np
import random as rd
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.feature_selection import RFE
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import torch
import torch.nn as nn
import torch.optim as optim
import wandb
import random

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

# Load the descriptor table (features) and the target table. In both CSV files
# the first column (molecule label / molecule index) becomes the row index
# instead of being treated as data.
df_x, df_y = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./padel_83_1490.csv', './FEB_catechol83.csv')
)
# Working names for the RFE section; these refer to the same objects (no copy).
df_x_rfe, df_y_rfe = df_x, df_y

### 1. Selección de Características de Acuerdo a Recursive Feature Elimination

In [None]:
# Drop descriptors that are zero in at least 73 rows (the original note
# describes this cut-off as "90% of the values equal to 0").
ceros_por_columna = df_x_rfe.eq(0).sum()
demasiados_ceros = ceros_por_columna >= 73
columnas_a_eliminar = ceros_por_columna.index[demasiados_ceros.to_numpy()].tolist()
df_x_rfe = df_x_rfe.drop(columns=columnas_a_eliminar)

# Run RFE on the reduced descriptor set, using an 80/20 train/test split.
X, y = df_x_rfe.values, df_y_rfe.values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# A random forest ranks the descriptors; RFE keeps the best 131 of them.
modelo_base = RandomForestRegressor(n_estimators=100, random_state=42)
rfe = RFE(estimator=modelo_base, n_features_to_select=131).fit(
    X_train,
    y_train.ravel(),  # flatten the target to 1-D for the estimator
)

# Map the boolean support mask back to descriptor names and show them.
columnas_seleccionadas = df_x_rfe.columns[rfe.support_]
print(
    "\nColumnas seleccionadas por RFE:",
    columnas_seleccionadas.tolist(),
    sep="\n",
)


Columnas seleccionadas por RFE:
['AMR', 'AATS6m', 'AATS5v', 'AATS5e', 'AATS7e', 'AATS5p', 'AATS4i', 'AATS7i', 'AATS4s', 'ATSC2c', 'ATSC4c', 'ATSC8c', 'ATSC6m', 'ATSC7v', 'ATSC7e', 'ATSC6p', 'ATSC7p', 'ATSC7i', 'ATSC3s', 'ATSC5s', 'ATSC6s', 'ATSC8s', 'AATSC2c', 'AATSC8m', 'AATSC5v', 'AATSC4e', 'AATSC5e', 'AATSC7e', 'AATSC2p', 'AATSC8p', 'AATSC2i', 'AATSC7s', 'MATS5c', 'MATS5m', 'MATS4e', 'MATS7e', 'MATS2p', 'MATS6p', 'MATS7p', 'MATS8p', 'MATS2i', 'MATS6i', 'MATS7i', 'MATS2s', 'MATS5s', 'MATS7s', 'GATS7c', 'GATS2m', 'GATS5v', 'GATS6v', 'GATS7v', 'GATS1e', 'GATS4e', 'GATS5e', 'GATS6e', 'GATS1p', 'GATS2i', 'GATS1s', 'GATS2s', 'GATS7s', 'VE3_DzZ', 'VE2_Dzv', 'VR2_Dzv', 'VE1_Dze', 'VE2_Dze', 'VR1_Dzi', 'VR2_Dzi', 'VR3_Dzi', 'VE3_Dzs', 'VR1_Dzs', 'VR2_Dzs', 'BCUTw-1h', 'BCUTc-1l', 'nBondsM', 'SpMax3_Bhm', 'SpMin3_Bhm', 'SpMin4_Bhm', 'SpMin6_Bhm', 'SpMax2_Bhv', 'SpMin5_Bhv', 'SpMax2_Bhe', 'SpMin2_Bhe', 'SpMax2_Bhi', 'SpMax8_Bhs', 'SpMin2_Bhs', 'SpMin3_Bhs', 'SC-3', 'VC-3', 'VPC-5', 'ASP-0', '

### 2. Selección de Características de Acuerdo al Paper

In [None]:
# Descriptor names used for the "feature selection according to the paper"
# section. Every name listed here is looked up as a column of `df_x` right
# after this list, so each entry must match a column label exactly.
columnas_relevantes = [
    'SHaaCH', 'SpMin3_Bhm', 'VE3_Dzi', 'SpMax2_Bhe', 'SaaCH', 'SpMax2_Bhp', 'AATSC2s', 'MLFER_L', 'SpMin3_Bhs', 'SpMax2_Bhi',
    'VE2_Dt', 'SHother', 'SpDiam_Dt', 'SpMax2_Bhv', 'GATS2s', 'hmin', 'JGI3', 'SpMax2_Bhm', 'VR2_Dt', 'SpMin3_Bhe', 'nBondsM',
    'ATSC2s', 'ATSC2c', 'SwHBa', 'ASP-0', 'minHBa', 'ETA_Beta_ns', 'IC5', 'VR1_Dzm', 'ATS2v', 'MLogP', 'SpMin2_Bhs',
    'maxHBa', 'ATSC4m', 'ATSC4s', 'AATSC7e', 'VE3_Dt', 'VR2_Dzi', 'ETA_Shape_P', 'SpMin2_Bhv', 'VP-5', 'AATS6p', 'VE1_Dzi',
    'minHother', 'MATS5e', 'SpMax7_Bhm', 'VP-6', 'SpMax3_Bhp', 'GATS1s', 'VE3_Dzm', 'VR1_Dze', 'ATS3v', 'SpMin2_Bhe',
    'ATS2s', 'MLFER_BO', 'GATS7c', 'VR1_Dzi', 'AATSC2c', 'ATSC1m', 'MLFER_BH', 'MATS4s', 'AATSC4c', 'MATS2s', 'VE3_Dze',
    'ZMIC4', 'AATSC4s', 'SaasC', 'AATSC8e', 'MAXDP', 'GATS1e', 'AATS7i', 'AATSC8s', 'AATSC5c', 'AATS8i', 'EE_Dt',
    'AATSC5e', 'SpMax3_Bhi', 'ATSC3p', 'VE3_DzZ', 'MATS2p', 'SpMAD_D', 'MATS5m', 'IC4', 'VE1_Dt', 'VR3_Dzp', 'SpMax3_Bhv',
    'ATSC5e', 'MATS2i', 'SpMAD_Dzs', 'MATS7e', 'SpMin3_Bhi', 'VE2_Dzi', 'MIC2', 'AATSC6p', 'SpMax6_Bhi', 'VR1_DzZ',
    'VE2_DzZ', 'CIC4', 'SpMin2_Bhm', 'AATSC4e', 'VE1_DzZ', 'VE3_D', 'MATS3i', 'ATSC5c', 'VR3_Dzv', 'AATSC2i', 'MATS5c',
    'SpMin2_Bhp', 'SpMin3_Bhp', 'MAXDP2', 'ATSC7s', 'MATS5s', 'SpMin8_Bhs', 'ATSC3i', 'VE2_Dze', 'GATS2m', 'AATSC5v',
    'MIC5', 'SpMin2_Bhi', 'ATSC7e', 'SpMax3_Bhm', 'ATSC8e', 'BCUTp-1l', 'AATSC2v', 'VE1_Dze', 'ATS6p', 'GATS6p',
    'VR3_Dzi', 'ATS1s', 'MATS2v', 'piPC2'
]
# Restrict the descriptor table to the relevant columns, then expose the
# features (X) and the target (y) as NumPy arrays for the models below.
df_x_filt = df_x.loc[:, columnas_relevantes]
X, y = df_x_filt.values, df_y.values

### 3. Evaluación del modelo K Nearest Neighbors

In [None]:
# Split BEFORE scaling: the scalers must only see training rows, otherwise the
# min/max of the held-out rows leaks into the features used for fitting.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.reshape(-1, 1), test_size=0.2, random_state=42
)

# Fit the min-max scalers on the training portion and reuse them for the test portion.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)
y_train = scaler_y.fit_transform(y_train)
y_test = scaler_y.transform(y_test)

# K-nearest-neighbours regressor with k = 7.
knn = KNeighborsRegressor(n_neighbors=7)
knn.fit(X_train, y_train.ravel())
y_pred_knn = knn.predict(X_test)

# Note: the target was min-max scaled, so RMSE / MSE / MAE are expressed in
# scaled units (R² is unaffected by that linear rescaling).
r2 = r2_score(y_test, y_pred_knn)
mse = mean_squared_error(y_test, y_pred_knn)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_knn)

print(f'R²:   {r2:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'MSE:  {mse:.4f}')
print(f'MAE:  {mae:.4f}')

R²:   0.6803
RMSE: 0.0833
MSE:  0.0069
MAE:  0.0672


### 4. Evaluación de Redes Neuronales

In [None]:
# Split into train (60%), validation (20%) and test (20%): first hold out 20%
# for testing, then take 25% of the remaining 80% for validation.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Standardise the features with statistics from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

# Targets must have shape (n, 1) to line up with the model output.
# `y` is taken from a DataFrame's `.values`, which is already 2-D, so
# `unsqueeze(1)` would produce (n, 1, 1) and make the loss broadcast
# predictions against every target in the batch. `reshape(-1, 1)` gives
# (n, 1) whether the incoming array is 1-D or a single-column 2-D array.
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).reshape(-1, 1).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1).to(device)


In [None]:
def set_seed(seed=13):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and switches cuDNN to deterministic mode
    with auto-tuning (benchmark) disabled.

    Args:
        seed: Integer seed applied to all generators (default 13).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


set_seed(13)

In [None]:
# Start a Weights & Biases run; the metrics logged during training are
# recorded under this entity/project.
wandb.init(
    entity="zerocris13",
    project="challenge2",
)
class CompactNN(nn.Module):
    """Small fully connected regressor: input_dim -> 128 -> 64 -> 16 -> 1.

    The first hidden layer is followed by layer normalisation, LeakyReLU(0.1)
    and dropout (p=0.1); the remaining hidden layers use LeakyReLU(0.1) only.
    The output layer is linear with a single unit.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Input stage: projection, normalisation, activation, dropout.
        layers = [
            nn.Linear(input_dim, 128),
            nn.LayerNorm(128),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.1),
        ]
        # Narrowing hidden stages, each a linear layer plus activation.
        for n_in, n_out in ((128, 64), (64, 16)):
            layers += [nn.Linear(n_in, n_out), nn.LeakyReLU(0.1)]
        # Single-unit regression head.
        layers.append(nn.Linear(16, 1))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run the layer stack on `x` and return its output."""
        prediction = self.net(x)
        return prediction

# Build the network for the number of feature columns in the training tensor
# and move it to the selected device.
n_features = X_train_tensor.shape[1]
model = CompactNN(n_features).to(device)

# Mean-squared-error objective, optimised with Adam (with weight decay).
criterion = nn.MSELoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=0.0038576,
    weight_decay=0.0067415,
)

In [None]:
# Training configuration.
epochs = 300
batch_size = 16
patience = 25                  # epochs without validation improvement before stopping
best_val_loss = float('inf')
counter = 0                    # consecutive epochs without validation improvement

n_train = X_train_tensor.size(0)

for epoch in range(epochs):
    model.train()
    # Fresh shuffle of the training rows every epoch.
    permutation = torch.randperm(n_train)
    train_loss = 0.0
    n_batches = 0  # batches actually processed (single-sample batches are skipped)

    for i in range(0, n_train, batch_size):
        indices = permutation[i:i+batch_size]
        if len(indices) < 2: continue  # avoid a batch made of a single sample

        x_batch = X_train_tensor[indices]
        y_batch = y_train_tensor[indices]

        optimizer.zero_grad()
        output = model(x_batch)
        loss = criterion(output, y_batch)
        loss.backward()

        # Log all gradient norms of this batch in ONE call so they share a
        # single wandb step instead of advancing the step once per parameter.
        grad_log = {
            f"gradients/{name}": param.grad.norm().item()
            for name, param in model.named_parameters()
            if param.grad is not None
        }
        grad_log["epoch"] = epoch + 1
        wandb.log(grad_log)

        optimizer.step()
        train_loss += loss.item()
        n_batches += 1

    # Validation loss on the full validation set, without gradients or dropout.
    model.eval()
    with torch.no_grad():
        val_output = model(X_val_tensor)
        val_loss = criterion(val_output, y_val_tensor).item()

    # Average over the batches that were really used; dividing by
    # n_train // batch_size is wrong in general and is zero when
    # n_train < batch_size.
    mean_train_loss = train_loss / n_batches if n_batches else float('nan')
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": mean_train_loss,
        "val_loss": val_loss
    })

    # Early stopping: checkpoint on improvement, otherwise count towards patience.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        torch.save(model.state_dict(), "model_val.pth")
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping en epoch {epoch+1}")
            break

Early stopping en epoch 36


In [None]:
# Restore the checkpoint written by the training loop ("model_val.pth", saved
# whenever the validation loss improved). Loading any other file name would
# evaluate weights that this notebook run did not produce.
# `map_location` keeps the load working on a machine without the GPU used for
# training; `weights_only=True` restricts unpickling to tensors / state dicts.
best_state = torch.load("model_val.pth", map_location=device, weights_only=True)
model.load_state_dict(best_state)
model.eval()
with torch.no_grad():
    # reshape(-1) flattens predictions and targets to 1-D regardless of any
    # trailing singleton dimensions (and, unlike squeeze(), never yields a
    # 0-d tensor when there is a single sample).
    y_pred_tensor = model(X_test_tensor).cpu().reshape(-1)
    y_true_tensor = y_test_tensor.cpu().reshape(-1)
    train_y_pred_tensor = model(X_train_tensor).cpu().reshape(-1)
    train_y_true_tensor = y_train_tensor.cpu().reshape(-1)

y_pred = y_pred_tensor.numpy()
y_true = y_true_tensor.numpy()

train_y_pred = train_y_pred_tensor.numpy()
train_y_true = train_y_true_tensor.numpy()

# Train
# RMSE is computed as sqrt(MSE) instead of `squared=False`, which newer
# scikit-learn releases deprecate and then remove.
train_r2 = r2_score(train_y_true, train_y_pred)
train_mse = mean_squared_error(train_y_true, train_y_pred)
train_rmse = np.sqrt(train_mse)
train_mae = mean_absolute_error(train_y_true, train_y_pred)

# Test
test_r2 = r2_score(y_true, y_pred)
test_mse = mean_squared_error(y_true, y_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_true, y_pred)
print("Train metrics:")
print(f"R²:   {train_r2:.4f}")
print(f"RMSE: {train_rmse:.4f}")
print(f"MSE:  {train_mse:.4f}")
print(f"MAE:  {train_mae:.4f}")

print("Test metrics:")
print(f"R²:   {test_r2:.4f}")
print(f"RMSE: {test_rmse:.4f}")
print(f"MSE:  {test_mse:.4f}")
print(f"MAE:  {test_mae:.4f}")


Train metrics:
R²:   0.8585
RMSE: 0.2498
MSE:  0.0624
MAE:  0.1954
Test metrics:
R²:   0.8207
RMSE: 0.2183
MSE:  0.0477
MAE:  0.1591


  model.load_state_dict(torch.load("best_model_val.pth"))
