In [1]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.6


# Libraries





In [2]:

import pandas as pd
import numpy as np
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem, DataStructs
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
from tqdm import tqdm
import warnings

In [3]:
# Keep cell output quiet: mute RDKit's 'rdApp.*' log channels and
# ignore every Python warning category.
RDLogger.DisableLog('rdApp.*')
warnings.simplefilter("ignore")


# Loading the Data


In [5]:

# The first CSV column is read as the index and then discarded in favour of a
# fresh 0..n-1 RangeIndex, for both the training and the test table.
df_train = pd.read_csv("data_train.csv", index_col=0)
df_train = df_train.reset_index(drop=True)
df_test = pd.read_csv("smiles_test.csv", index_col=0)
df_test = df_test.reset_index(drop=True)
print("Data loaded: ", df_train.shape, df_test.shape)



Data loaded:  (12000, 12) (5896, 1)



# Preparing Train/Validation Split




In [6]:

# Hold out 20% of the rows for validation with a fixed seed. Inputs are the
# 'smiles' column; targets are every column from position 1 onward
# (this presumes 'smiles' sits at position 0 -- confirm against the CSV).
X_train_raw, X_val_raw, Y_train, Y_val = train_test_split(
    df_train['smiles'],
    df_train.iloc[:, 1:],
    random_state=42,
    test_size=0.2,
)



# Fingerprint Generator

In [7]:

def smiles_to_morgan(smiles_list, radius=2, n_bits=1024):
    """Encode SMILES strings as a dense matrix of Morgan fingerprint bits.

    Parameters
    ----------
    smiles_list : sized iterable of str
        SMILES strings; must support ``len()`` and iteration.
    radius : int, default 2
        Radius handed to ``AllChem.GetMorganFingerprintAsBitVect``.
    n_bits : int, default 1024
        Fingerprint length, i.e. the number of columns of the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(smiles_list), n_bits)``. A row is left
        as all zeros when ``Chem.MolFromSmiles`` returns a falsy value for
        the corresponding entry.
    """
    matrix = np.zeros((len(smiles_list), n_bits))
    for row, smiles in enumerate(smiles_list):
        molecule = Chem.MolFromSmiles(smiles)
        if not molecule:
            # Nothing to encode for this entry; its row stays zero-filled.
            continue
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=n_bits)
        # ConvertToNumpyArray writes into `buffer` in place. The row
        # assignment below relies on it coming back with n_bits entries
        # (presumably RDKit resizes the 1-element placeholder -- confirm).
        buffer = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(bit_vect, buffer)
        matrix[row] = buffer
    return matrix

# Featurise both halves of the split with the default radius and bit count.
X_train_fps, X_val_fps = map(smiles_to_morgan, (X_train_raw, X_val_raw))




# Training the Models (SMOTE + RF)

In [8]:

def train_random_forest_models(X_train_fps, Y_train, X_eval_fps):
    """Fit one SMOTE-balanced random forest per label column and score X_eval_fps.

    Parameters
    ----------
    X_train_fps : numpy.ndarray
        Training feature matrix, one row per row of ``Y_train``.
    Y_train : pandas.DataFrame
        One column per task. Entries equal to ``-1`` are treated as
        unlabelled and dropped for that task only.
    X_eval_fps : numpy.ndarray
        Feature matrix to predict on.

    Returns
    -------
    models : list of RandomForestClassifier
        The fitted classifiers in task order. A skipped task contributes no
        entry, so the list can be shorter than ``Y_train.shape[1]`` and
        ``models[i]`` is then not guaranteed to belong to column ``i``.
    predictions : numpy.ndarray
        Shape ``(X_eval_fps.shape[0], n_tasks)``. Column ``i`` holds
        ``predict_proba(X_eval_fps)[:, 1]`` of the task-``i`` classifier;
        columns of skipped tasks stay at 0.
    """
    n_tasks = Y_train.shape[1]
    predictions = np.zeros((X_eval_fps.shape[0], n_tasks))
    models = []

    for i in range(n_tasks):
        y = Y_train.iloc[:, i]
        # Positional boolean mask, so it applies to the numpy feature matrix
        # and to the label Series alike regardless of the Series index.
        valid_idx = (y != -1).to_numpy()
        X_sub = X_train_fps[valid_idx]
        y_sub = y[valid_idx]

        _, class_counts = np.unique(y_sub, return_counts=True)
        if len(class_counts) < 2:
            continue  # skip this task if only one class is present

        # SMOTE searches k_neighbors neighbours inside the class it
        # oversamples, so that class must hold more than k_neighbors samples.
        # Keep the library default of 5 whenever possible and shrink it for
        # rarer classes instead of letting fit_resample raise.
        smallest_class = int(class_counts.min())
        if smallest_class >= 2:
            sm = SMOTE(
                sampling_strategy='auto',
                k_neighbors=min(5, smallest_class - 1),
                random_state=42,
            )
            X_balanced, y_balanced = sm.fit_resample(X_sub, y_sub)
        else:
            # A single sample has no neighbour to interpolate with; train on
            # the raw data and leave the imbalance to class_weight="balanced".
            X_balanced, y_balanced = X_sub, y_sub

        clf = RandomForestClassifier(
            n_estimators=500, max_depth=12, class_weight="balanced", random_state=42
        )
        clf.fit(X_balanced, y_balanced)
        models.append(clf)

        # Column 1 of predict_proba is the second entry of clf.classes_.
        predictions[:, i] = clf.predict_proba(X_eval_fps)[:, 1]

    return models, predictions

# Fit on the training split and collect per-task probabilities for the
# validation split.
models, preds_val = train_random_forest_models(
    X_train_fps=X_train_fps,
    Y_train=Y_train,
    X_eval_fps=X_val_fps,
)



# Evaluate AUC

In [9]:

def average_auc(y_true_df, y_pred_array):
    """Print and return the mean ROC AUC over the label columns.

    Parameters
    ----------
    y_true_df : pandas.DataFrame
        One column per task. Entries equal to ``-1`` are excluded from that
        task's score.
    y_pred_array : numpy.ndarray
        Predicted scores, shape ``(len(y_true_df), y_true_df.shape[1])``.

    Returns
    -------
    numpy.float64
        Mean of the per-task AUCs. Tasks whose remaining labels do not
        contain at least two distinct values are left out of the mean; the
        result is NaN when no task can be scored.
    """
    auc_scores = []
    for i in range(y_true_df.shape[1]):
        y_true = y_true_df.iloc[:, i]
        # Positional mask so it can index the numpy prediction array directly.
        valid = (y_true != -1).to_numpy()
        labels = y_true.to_numpy()[valid]
        # roc_auc_score raises when only one class is present, which also
        # covers the case of no valid rows at all -- skip such tasks.
        if len(np.unique(labels)) < 2:
            continue
        auc_scores.append(roc_auc_score(labels, y_pred_array[valid, i]))

    mean_auc = np.mean(auc_scores) if auc_scores else np.float64("nan")
    print(f" Mean AUC: {mean_auc:.4f}")
    return mean_auc

# Score the held-out predictions; the call prints the mean AUC and returns it.
average_auc(y_true_df=Y_val, y_pred_array=preds_val)



 Mean AUC: 0.7646


np.float64(0.7646369073108868)

# Retrain on Full Dataset + Predict

In [10]:

# Featurise every labelled molecule and every test molecule.
X_full_fps, X_test_fps = (
    smiles_to_morgan(frame["smiles"]) for frame in (df_train, df_test)
)

# Refit on all labelled rows; only the test-set probabilities are kept.
_, final_predictions = train_random_forest_models(
    X_train_fps=X_full_fps,
    Y_train=df_train.iloc[:, 1:],
    X_eval_fps=X_test_fps,
)



# Submission File

In [11]:
#
# Wrap the test-set probabilities in a frame whose columns reuse the training
# frame's column names from position 1 onward; the row index is written out
# as the "Id" column.
submission = pd.DataFrame(final_predictions, columns=df_train.columns[1:])
submission.index.name = "Id"

output_path = "final_output.csv"
submission.to_csv(output_path)
# Build the message from the path that was written so the two cannot drift.
print(f" Submission saved as {output_path}")

 Submission saved as submission.csv
