In [None]:
!pip install rdkit -q
print("âœ… Installation complete")

âœ… Installation complete


In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from rdkit.Chem import rdMolDescriptors
from tqdm import tqdm
import warnings
from rdkit import RDLogger
from scipy.sparse import vstack, csr_matrix
import numpy as np
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, f1_score, precision_score, recall_score
)
import time

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths used for the data files can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Where the experiment folder and the drug -> SMILES table live on Drive.
BASE_PATH = '/content/drive/MyDrive/MLHygnn/BaseLine/RandomForest_Chemical/'
SMILES_FILE = '/content/drive/MyDrive/MLHygnn/BaseLine/Drugs_with_Smiles.csv'

# Hyperparameters handed to the random forest in the training cell.
config = dict(n_estimators=500, max_depth=20, random_state=42)

print("âœ… Config loaded")

# Positive / negative drug pairs of each split, read in the same order as the
# names on the left-hand side (file names are spelled exactly as on disk).
train_pos, train_neg, test_pos, test_neg = (
    pd.read_csv(split_csv)
    for split_csv in (
        f'{BASE_PATH}data/train_postive.csv',
        f'{BASE_PATH}data/train_negatives.csv',
        f'{BASE_PATH}data/test_postive.csv',
        f'{BASE_PATH}data/test_negatives.csv',
    )
)

# DrugBank ID -> SMILES string lookup.
smiles_df = pd.read_csv(SMILES_FILE)
smiles_dict = smiles_df.set_index('DrugBank_ID')['SMILES'].to_dict()

print(f"âœ… Data loaded: {len(train_pos)} train+, {len(train_neg)} train-, {len(test_pos)} test+, {len(test_neg)} test-")


âœ… Config loaded
âœ… Data loaded: 153501 train+, 153501 train-, 19189 test+, 19189 test-


## Features: Morgan fingerprints (radius 2, 2048 bits)

In [None]:

# Keep the cell output readable: ignore every Python warning and switch off
# all RDKit log channels matching "rdApp.*".
warnings.simplefilter('ignore')
RDLogger.DisableLog('rdApp.*')


# DEFINE FUNCTIONS
def smiles_to_fp(smiles, radius=2, n_bits=2048):
    """Convert a SMILES string into a Morgan fingerprint bit vector.

    Args:
        smiles: SMILES string of the molecule. A value that is not a string
            (for example ``None`` or the float NaN pandas produces for an
            empty CSV cell) is handled like a SMILES that cannot be parsed.
        radius: Morgan radius passed to RDKit. Defaults to 2.
        n_bits: Length of the bit vector. Defaults to 2048.

    Returns:
        A 1-D ``numpy.ndarray`` of length ``n_bits`` with dtype ``uint8``
        containing 0/1 values. The vector is all zeros when no molecule
        could be built from ``smiles``.
    """
    # Guard before touching RDKit: a missing value is not a string and must
    # not reach the SMILES parser.
    if not isinstance(smiles, str):
        return np.zeros(n_bits, dtype=np.uint8)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparsable SMILES: same all-zero placeholder as for missing values.
        return np.zeros(n_bits, dtype=np.uint8)

    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    # One explicit dtype for both the valid and the fallback path, so every
    # fingerprint in the cache has the same type.
    return np.array(bit_vect, dtype=np.uint8)


# FINGERPRINT CACHE (needs smiles_dict from the data-loading cell)
# Every drug is fingerprinted exactly once; pair features are then built from
# dictionary lookups instead of re-parsing SMILES for each pair.
print("Pre-computing fingerprints for all drugs...")
drug_fps = {
    drug_id: smiles_to_fp(smiles)
    for drug_id, smiles in tqdm(smiles_dict.items(), desc="Computing fingerprints")
}
print(f"âœ… Pre-computed {len(drug_fps)} drug fingerprints")

# DEFINE FAST PAIR FUNCTION
def pair_features_fast(drug1, drug2):
    """Return the concatenated cached fingerprints of a drug pair.

    Both IDs are looked up in the module-level ``drug_fps`` cache. An ID that
    is absent from the cache contributes a vector of 2048 zeros. The vector
    of ``drug1`` comes first in the result, followed by that of ``drug2``.
    """
    halves = [drug_fps.get(drug_id, np.zeros(2048)) for drug_id in (drug1, drug2)]
    return np.concatenate(halves)

# PREPARE TRAINING / TEST DATA - SPARSE
def build_pair_dataset(pos_pairs, neg_pairs, split_name):
    """Assemble the sparse feature matrix and the labels of one data split.

    Args:
        pos_pairs: DataFrame with 'Drug1_ID' and 'Drug2_ID' columns; each row
            becomes a sample labelled 1.
        neg_pairs: DataFrame with the same two columns; each row becomes a
            sample labelled 0.
        split_name: Name of the split ("Train" / "Test"), used in the
            progress-bar captions and in the unknown-ID note.

    Returns:
        Tuple ``(X, y)``. ``X`` is a CSR matrix with one row per pair
        (all positive pairs first, then all negative pairs), each row being
        the output of ``pair_features_fast``. ``y`` is the matching integer
        label array.
    """
    sparse_rows = []
    unknown_lookups = 0
    for pairs, caption in ((pos_pairs, f"{split_name} positive"),
                           (neg_pairs, f"{split_name} negative")):
        # Zip the two ID columns instead of iterrows(): no per-row Series is
        # created, and the visiting order of the pairs is unchanged.
        id_pairs = zip(pairs['Drug1_ID'], pairs['Drug2_ID'])
        for drug1, drug2 in tqdm(id_pairs, total=len(pairs), desc=caption):
            # pair_features_fast silently substitutes zeros for IDs missing
            # from drug_fps; count them so ID mismatches do not go unnoticed.
            unknown_lookups += (drug1 not in drug_fps) + (drug2 not in drug_fps)
            # One 1 x n sparse row per pair keeps peak memory low; a dense
            # matrix of all pairs would be far larger.
            sparse_rows.append(csr_matrix(pair_features_fast(drug1, drug2)))

    if unknown_lookups:
        print(f"   Note: {unknown_lookups} drug ID lookups in the {split_name} split "
              f"had no cached fingerprint and were encoded as zeros")

    # Request CSR explicitly: later cells read .indices / .indptr, which only
    # compressed formats provide.
    X = vstack(sparse_rows, format='csr')
    y = np.concatenate([
        np.ones(len(pos_pairs), dtype=int),
        np.zeros(len(neg_pairs), dtype=int),
    ])
    # sparse_rows goes out of scope here, so the per-row matrices are not
    # kept alive next to the stacked result.
    return X, y


print("\nPreparing training data (sparse)...")
X_train, y_train = build_pair_dataset(train_pos, train_neg, "Train")
print(f"âœ… Training data ready: {X_train.shape}")
print(f"   Sparsity: {1 - X_train.nnz / np.prod(X_train.shape):.2%}")
# Memory saving = bytes of the CSR arrays versus a dense array of the same
# shape and dtype. This is lower than the sparsity figure because CSR also
# stores a column index for every non-zero value.
train_csr_bytes = X_train.data.nbytes + X_train.indices.nbytes + X_train.indptr.nbytes
train_dense_bytes = np.prod(X_train.shape) * X_train.dtype.itemsize
print(f"   Memory saved: ~{(1 - train_csr_bytes / train_dense_bytes) * 100:.1f}%")

print("\nPreparing test data (sparse)...")
X_test, y_test = build_pair_dataset(test_pos, test_neg, "Test")
print(f"âœ… Test data ready: {X_test.shape}")
print(f"   Sparsity: {1 - X_test.nnz / np.prod(X_test.shape):.2%}")

# MEMORY CHECK
import sys


def _csr_megabytes(matrix):
    """Return the combined size, in units of 1024**2 bytes, of the data,
    indices and indptr arrays of a compressed sparse matrix."""
    raw_bytes = matrix.data.nbytes + matrix.indices.nbytes + matrix.indptr.nbytes
    return raw_bytes / (1024**2)


train_size_mb = _csr_megabytes(X_train)
test_size_mb = _csr_megabytes(X_test)
print(f"\nðŸ“Š Memory Usage:")
print(f"   Training data: {train_size_mb:.2f} MB")
print(f"   Test data: {test_size_mb:.2f} MB")
print(f"   Total: {train_size_mb + test_size_mb:.2f} MB")

Pre-computing fingerprints for all drugs...


Computing fingerprints: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1709/1709 [00:02<00:00, 770.61it/s]


âœ… Pre-computed 1709 drug fingerprints

Preparing training data (sparse)...


Train positive: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 153501/153501 [00:42<00:00, 3625.18it/s]
Train negative: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 153501/153501 [00:42<00:00, 3583.36it/s]


âœ… Training data ready: (307002, 4096)
   Sparsity: 97.84%
   Memory saved: ~97.8%

Preparing test data (sparse)...


Test positive: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 19189/19189 [00:05<00:00, 3277.20it/s]
Test negative: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 19189/19189 [00:04<00:00, 4118.40it/s]


âœ… Test data ready: (38378, 4096)
   Sparsity: 97.82%

ðŸ“Š Memory Usage:
   Training data: 312.59 MB
   Test data: 39.28 MB
   Total: 351.87 MB


In [None]:
print("\n" + "="*80, "TRAINING RANDOM FOREST", "="*80, sep="\n")

# The three tunable hyperparameters come from `config`; parallelism and
# progress logging are fixed here.
forest_kwargs = {key: config[key] for key in ('n_estimators', 'max_depth', 'random_state')}
rf = RandomForestClassifier(
    n_jobs=-1,  # Change to n_jobs=1 for single-core training
    verbose=1,
    **forest_kwargs,
)

# Wall-clock duration of the fit, reported again in the results cell.
start_time = time.time()
rf.fit(X_train, y_train)
train_time = time.time() - start_time

print(f"âœ… Training complete: {train_time/60:.2f} minutes")

# TEST MODEL
print("\n" + "="*80)
print("TESTING")
print("="*80)

# Evaluate the forest once. predict() returns the class with the highest
# averaged probability, i.e. the arg-max over what predict_proba() returns,
# so deriving the labels from the probability matrix avoids a second pass
# through all trees while giving the same predictions.
class_proba = rf.predict_proba(X_test)
y_pred = rf.classes_[np.argmax(class_proba, axis=1)]
# Column 1 belongs to rf.classes_[1], which is label 1 (labels are 0 and 1).
y_pred_proba = class_proba[:, 1]

# Calculate metrics: threshold-based scores use the hard labels, ranking
# scores (ROC-AUC, PR-AUC) use the positive-class probability.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
pr_auc = average_precision_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)



TRAINING RANDOM FOREST


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 26.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 29.4min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


âœ… Training complete: 29.37 minutes

TESTING


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    2.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    6.0s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    6.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    2.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    4.8s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    5.4s finished


In [None]:
# PRINT RESULTS
# Build the whole report first and emit it with a single print call.
rule = "="*80
report_lines = [
    "\n" + rule,
    "RANDOM FOREST - FINAL RESULTS",
    rule,
    f"Accuracy:  {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1-Score:  {f1:.4f}",
    f"ROC-AUC:   {roc_auc:.4f}",
    f"PR-AUC:    {pr_auc:.4f}",
    f"\nTraining Time: {train_time/60:.2f} minutes",
    rule,
]
print("\n".join(report_lines))


RANDOM FOREST - FINAL RESULTS
Accuracy:  0.7853
Precision: 0.7546
Recall:    0.8456
F1-Score:  0.7975
ROC-AUC:   0.8752
PR-AUC:    0.8779

Training Time: 29.37 minutes
