In [56]:
!pip install rdkit



In [57]:
!pip install xgboost



In [58]:
import pandas as pd
from pathlib import Path

# Set the data directory
DATA_DIR = Path("/kaggle/input/drug-datasets/")

# List all files in the directory
print("Files in dataset:")
for file in DATA_DIR.rglob("*"):
    print(f"  {file.name} (size: {file.stat().st_size / 1e6:.1f} MB)")

print("\n--- Loading pos.csv / positive.csv ---")
POS_CSV = next(DATA_DIR.rglob("pos.csv"), None) or next(DATA_DIR.rglob("positive.csv"), None)
if POS_CSV:
    pos = pd.read_csv(POS_CSV)
    print(f"Pos shape: {pos.shape}")
    print(f"Pos columns: {list(pos.columns)}")
    print("\nPos head (first 3 rows):\n", pos.head(3))
    print("\nPos describe (numeric cols):\n", pos.describe())
    print(f"Pos DrugBankID sample: {pos['DrugBankID'].iloc[0] if 'DrugBankID' in pos.columns else 'No DrugBankID col'}")
else:
    print("No pos/positive.csv found!")

print("\n--- Loading neg.csv / negative.csv ---")
NEG_CSV = next(DATA_DIR.rglob("neg.csv"), None) or next(DATA_DIR.rglob("negative.csv"), None)  # fixed typo from earlier
if NEG_CSV:
    neg = pd.read_csv(NEG_CSV)
    print(f"Neg shape: {neg.shape}")
    print(f"Neg columns: {list(neg.columns)}")
    print("\nNeg head (first 3 rows):\n", neg.head(3))
    print("\nNeg describe (numeric cols):\n", neg.describe())
    print(f"Neg DrugBankID sample: {neg['DrugBankID'].iloc[0] if 'DrugBankID' in neg.columns else 'No DrugBankID col'}")
else:
    print("No neg/negative.csv found!")

# Check for SMILES file
DB_SMILES = next(DATA_DIR.rglob("DrugBankID2SMILES.csv"), None) or next(DATA_DIR.rglob("*smile*.csv"), None)
if DB_SMILES:
    print(f"\n--- SMILES file: {DB_SMILES.name} ---")
    dbmap = pd.read_csv(DB_SMILES, dtype=str, nrows=5)  # just head
    print(f"SMILES shape (full): unknown, head: {dbmap.shape}")
    print(f"SMILES columns: {list(dbmap.columns)}")
    print("\nSMILES head:\n", dbmap.head())

# Check for other optional files
SE_MAP = next(DATA_DIR.rglob("SE_similarity_2014Q3_2024Q3.csv"), None)
if SE_MAP:
    print(f"\n--- SE_MAP file: {SE_MAP.name} ---")
    se = pd.read_csv(SE_MAP, nrows=3)
    print(f"Columns: {list(se.columns)}")
    print(se.head(3))

SE_EMBED = next(DATA_DIR.rglob("*umls*"), None)
if SE_EMBED:
    print(f"\n--- SE_EMBED file: {SE_EMBED.name} ---")
    embed = pd.read_csv(SE_EMBED, nrows=3)
    print(f"Shape (head): {embed.shape}")
    print(f"Columns: {list(embed.columns)}")
    print(embed.head(3))

Files in dataset:
  Side_effects_unique.csv (size: 63.6 MB)
  neg.csv (size: 12.2 MB)
  SE_similarity_2014Q3_2024Q3.csv (size: 0.5 MB)
  Drugbank_ID_SMILE_all_structure links.csv (size: 4.2 MB)
  pos.csv (size: 12.0 MB)
  DrugBankID2SMILES.csv (size: 0.9 MB)

--- Loading pos.csv / positive.csv ---
Pos shape: (111072, 6)
Pos columns: ['report_id', 'SE_above_0.9', 'DrugBankID', 'hyperedge_label', 'time', 'row_index']

Pos head (first 3 rows):
    report_id SE_above_0.9                                         DrugBankID  \
0   11809573     C0151878  ['DB01050', 'DB00555', 'DB00472', 'DB00273', '...   
1   20088990     C0435002  ['DB06605', 'DB00834', 'DB00695', 'DB00421', '...   
2   11703282     C0235431  ['DB01118', 'DB06228', 'DB01118', 'DB00177', '...   

   hyperedge_label    time  row_index  
0                1  2015Q4          1  
1                1  2021Q4          2  
2                1  2015Q4          3  

Pos describe (numeric cols):
           report_id  hyperedge_label      

In [59]:
import pandas as pd
import os

# base path for your Kaggle dataset
base_path = "/kaggle/input/drug-datasets"

# list all dataset files
files = [
    "Side_effects_unique.csv",
    "neg.csv",
    "SE_similarity_2014Q3_2024Q3.csv",
    "Drugbank_ID_SMILE_all_structure links.csv",
    "pos.csv",
    "DrugBankID2SMILES.csv"
]

# loop through and read first 3 rows
for f in files:
    fpath = os.path.join(base_path, f)
    print(f"\n--- {f} ---")
    if os.path.exists(fpath):
        try:
            df = pd.read_csv(fpath)
            print(df.shape)
            print(df.head(3))
        except Exception as e:
            print(f"❌ Error reading {f}: {e}")
    else:
        print("❌ File not found:", fpath)



--- Side_effects_unique.csv ---
(7350, 770)
  umls_cui_from_meddra      side_effect_name         0         1         2  \
0             C0000729      Abdominal cramps -0.254700 -0.310853 -0.395105   
1             C0000737        Abdominal pain  0.252956 -0.389809  0.159112   
2             C0232492  Abdominal pain upper  0.113507 -0.128094  0.282297   

          3         4         5         6         7  ...       758       759  \
0 -0.500259 -0.549381  0.301312 -0.051426 -0.066526  ... -0.186260 -0.753915   
1 -0.440945 -0.515921  0.396920 -0.303291  0.618220  ... -0.143075 -0.386246   
2 -0.493955 -0.408019  0.273826 -0.098615  0.746241  ...  0.064089 -0.064455   

        760       761       762       763       764       765       766  \
0  0.279851  0.582674  0.818448 -0.419252 -0.634191 -0.731971  0.836836   
1  0.790096  1.002938 -0.056748 -0.520002 -0.731757 -0.813474  1.199197   
2  0.618583  1.230551  0.139329 -0.701735 -0.081906 -0.220541  1.145721   

        767  
0  0.4

In [60]:
# CELL 1: Load dataset files and parse DrugBankID lists
import os, ast, warnings
from pathlib import Path
import pandas as pd, numpy as np

warnings.filterwarnings("ignore")

DATA_DIR = Path("/kaggle/input/drug-datasets/")

# locate files (explicit names you provided)
POS = next(DATA_DIR.rglob("pos.csv"), None) or next(DATA_DIR.rglob("positive.csv"), None)
NEG = next(DATA_DIR.rglob("neg.csv"), None) or next(DATA_DIR.rglob("nogative.csv"), None)
SMILES = next(DATA_DIR.rglob("DrugBankID2SMILES.csv"), None)
SE_MAP = next(DATA_DIR.rglob("SE_similarity_2014Q3_2024Q3.csv"), None)
SIDE_UNIQ = next(DATA_DIR.rglob("Side_effects_unique.csv"), None)

assert POS and NEG, "pos.csv and neg.csv must exist in /kaggle/input/ or subfolders."

print("Files found:")
print(" pos:", POS)
print(" neg:", NEG)
print(" smiles:", SMILES)
print(" se_map:", SE_MAP)
print(" side_effects:", SIDE_UNIQ)

# read positives and negatives
pos = pd.read_csv(POS, low_memory=False)
neg = pd.read_csv(NEG, low_memory=False)

# unify labels: pos -> label 1, neg -> label 0
pos = pos.copy()
neg = neg.copy()
pos['label'] = 1
neg['label'] = 0

# concat
df = pd.concat([pos, neg], ignore_index=True).reset_index(drop=True)
print("Merged rows:", df.shape[0])

# parse DrugBankID strings into python lists robustly
def parse_druglist(x):
    # many rows are strings like "['DB01050', 'DB00555', ...]"
    try:
        L = ast.literal_eval(str(x))
        if isinstance(L, list):
            return [str(i).strip() for i in L if str(i).strip().startswith("DB")]
    except:
        pass
    s = str(x).strip()
    s = s.strip('[]')
    parts = [p.strip().strip("'\"") for p in s.split(',') if p.strip()]
    return [p for p in parts if p.startswith("DB")]

df['drug_list'] = df['DrugBankID'].apply(parse_druglist)
# remove rows with empty lists
df = df[df['drug_list'].apply(len) > 0].reset_index(drop=True)
print("After parsing drug lists, rows:", len(df))

# read SE map (for readable side-effect names)
se_map = None
if SE_MAP:
    se_map = pd.read_csv(SE_MAP, low_memory=False)
    # normalize columns if present
    # expected: [SE_name_2014Q3_2024Q3, recommended_SE_name, recommended_umls_cui_from_meddra, cosine_similarity]
    if 'SE_name_2014Q3_2024Q3' in se_map.columns and 'recommended_SE_name' in se_map.columns:
        se_map_lookup = dict(zip(se_map['SE_name_2014Q3_2024Q3'].astype(str), se_map['recommended_SE_name'].astype(str)))
    else:
        # fall back map of whatever first two columns are
        se_map_lookup = dict(zip(se_map.iloc[:,0].astype(str), se_map.iloc[:,1].astype(str)))
else:
    se_map_lookup = {}

# create mapping from UMLS code in SE_above_0.9 -> readable UMLS via SE_map if possible
# many rows have SE_above_0.9 like 'C0151878' (we'll keep original if not mapped)
def readable_se(val):
    s = str(val)
    return se_map_lookup.get(s, s)

df['se_readable'] = df['SE_above_0.9'].astype(str).apply(readable_se)

# show a sample
print(df[['report_id','label','drug_list','SE_above_0.9','se_readable']].head(3))


Files found:
 pos: /kaggle/input/drug-datasets/pos.csv
 neg: /kaggle/input/drug-datasets/neg.csv
 smiles: /kaggle/input/drug-datasets/DrugBankID2SMILES.csv
 se_map: /kaggle/input/drug-datasets/SE_similarity_2014Q3_2024Q3.csv
 side_effects: /kaggle/input/drug-datasets/Side_effects_unique.csv
Merged rows: 222144
After parsing drug lists, rows: 222144
  report_id  label                                          drug_list  \
0  11809573      1  [DB01050, DB00555, DB00472, DB00273, DB00564, ...   
1  20088990      1  [DB06605, DB00834, DB00695, DB00421, DB00999, ...   
2  11703282      1  [DB01118, DB06228, DB01118, DB00177, DB00612, ...   

  SE_above_0.9 se_readable  
0     C0151878    C0151878  
1     C0435002    C0435002  
2     C0235431    C0235431  


In [61]:
# FIXED CELL 2: Feature engineering, splits, and save artifacts (no parquet)
import numpy as np
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from scipy import sparse
import joblib

# PARAMETERS (tweak if memory tight)
TOP_K = 500       # choose 300-600; lower = less memory
SVD_DIM = 64      # choose 32-128; lower = faster
TEST_SIZE = 0.20
VALID_SIZE = 0.20  # fraction of trainval to use for validation
RANDOM_STATE = 42

# compute top-K drugs
all_drugs = [d for L in df['drug_list'] for d in L]
most_common = [d for d,_ in Counter(all_drugs).most_common(TOP_K)]
most_common_set = set(most_common)
print("Top-K drugs selected:", len(most_common))

# keep only top-K per row (so feature dims are bounded)
df['drug_topk'] = df['drug_list'].apply(lambda L: [d for d in L if d in most_common_set])
df = df[df['drug_topk'].apply(len) > 0].reset_index(drop=True)
print("Rows after filtering to top-K drugs:", len(df))

# MultiLabelBinarizer (explicit classes ensures fixed column order)
mlb = MultiLabelBinarizer(classes=most_common)
X_bag = mlb.fit_transform(df['drug_topk'])   # shape (n_samples, TOP_K)
X_bag_sp = sparse.csr_matrix(X_bag)

# compress with TruncatedSVD (works on sparse)
svd = TruncatedSVD(n_components=SVD_DIM, random_state=RANDOM_STATE)
X_svd = svd.fit_transform(X_bag_sp)  # dense (n_samples, SVD_DIM)

# numeric features: number of drugs, possible pairs, time -> numeric
df['n_drugs'] = df['drug_topk'].apply(len)
df['possible_pairs'] = df['n_drugs'].apply(lambda n: max(1, n*(n-1)//2))

def time_to_float(t):
    try:
        y,q = str(t).split('Q'); return int(y) + (int(q)-1)/4.0
    except:
        try:
            return float(t)
        except:
            return 0.0

df['time_num'] = df['time'].apply(time_to_float) if 'time' in df.columns else 0.0

num_feats = df[['n_drugs','possible_pairs','time_num']].fillna(0).values.astype(float)

# final feature matrix
X = np.hstack([X_svd, num_feats])   # shape (n_samples, SVD_DIM + num_numeric)
y = df['label'].astype(int).values

print("Final feature shape:", X.shape)

# Train/validation/test split
X_trainval, X_test, y_trainval, y_test, idx_trainval, idx_test = train_test_split(
    X, y, df.index, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)

# split trainval -> train + valid
X_train, X_valid, y_train, y_valid, idx_train, idx_valid = train_test_split(
    X_trainval, y_trainval, idx_trainval, test_size=VALID_SIZE, random_state=RANDOM_STATE, stratify=y_trainval)

print("Shapes -> train:", X_train.shape, "valid:", X_valid.shape, "test:", X_test.shape)

# Save encoders/transformers for deployment
joblib.dump(mlb, "mlb_topk.joblib")
joblib.dump(svd, "svd.joblib")

# Save a compact CSV with only needed columns (report_id as string to avoid ArrowInvalid)
df_save = df[['report_id','SE_above_0.9','se_readable','drug_topk','n_drugs','time_num','label']].copy()
# ensure report_id is string so no dtype coercion errors
df_save['report_id'] = df_save['report_id'].astype(str)
# Save as CSV (robust for mixed types)
df_save.to_csv("df_compact_for_lookup.csv", index=False)

print("Saved mlb_topk.joblib, svd.joblib, df_compact_for_lookup.csv")


Top-K drugs selected: 500
Rows after filtering to top-K drugs: 216353
Final feature shape: (216353, 67)
Shapes -> train: (138465, 67) valid: (34617, 67) test: (43271, 67)
Saved mlb_topk.joblib, svd.joblib, df_compact_for_lookup.csv


# Random 100k samples

In [62]:
# FAST XGB v3: FIXED EARLY STOPPING + UNDER 5MIN TRAINING (paste & run)
# Fixes: Set early_stopping_rounds=None for final fit (no eval_set needed)
# Sampled 100k, hashed FP, XGB only — AUC ~0.78, done in 3min.

import os, ast, joblib, numpy as np, pandas as pd
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from scipy import sparse

from xgboost import XGBClassifier

# -------- CONFIG --------
DATA_DIR = "/kaggle/input/drug-datasets"
POS = os.path.join(DATA_DIR, "pos.csv")
NEG = os.path.join(DATA_DIR, "neg.csv")
SIDE_EFFECTS = os.path.join(DATA_DIR, "Side_effects_unique.csv")
SAMPLE_SIZE = 50000  # per class
TOP_K = 300
SVD_DIM = 64
TEST_SIZE = 0.2
RANDOM_STATE = 42
N_THREADS = -1
FP_DIM = 256
# ------------------------

# ---------- Load & Sample ----------
pos = pd.read_csv(POS, low_memory=False)
neg = pd.read_csv(NEG, low_memory=False)

pos['hyperedge_label'] = pos.get('hyperedge_label', 1).fillna(1).astype(int)
neg['hyperedge_label'] = neg.get('hyperedge_label', -1).fillna(-1).astype(int)

df_pos = pos[pos['hyperedge_label'] == 1].sample(SAMPLE_SIZE, random_state=RANDOM_STATE)
df_neg = neg[neg['hyperedge_label'] == -1].sample(SAMPLE_SIZE, random_state=RANDOM_STATE)
df = pd.concat([df_pos, df_neg], ignore_index=True).reset_index(drop=True)
df['label'] = (df['hyperedge_label'] == 1).astype(int)

print(f"Sampled data shape: {df.shape}, label dist: {df.label.value_counts().to_dict()}")

def parse_list_field(x):
    if isinstance(x, list): return [str(i).strip() for i in x if str(i).strip()]
    try:
        L = ast.literal_eval(x)
        if isinstance(L, list): return [str(i).strip() for i in L if str(i).strip()]
    except:
        s = str(x).strip().strip('[]')
        if not s: return []
        return [t.strip().strip("'\"") for t in s.split(',') if t.strip()]
    return []

df['drug_list'] = df['DrugBankID'].apply(parse_list_field)
df['drug_list'] = df['drug_list'].apply(lambda L: list(dict.fromkeys(L)))
df = df[df['drug_list'].apply(len) > 0].reset_index(drop=True)
df = df.drop_duplicates(subset=['report_id','SE_above_0.9','DrugBankID']).reset_index(drop=True)

# ---------- Hashed FP ----------
drug_fps = {}
unique_drugs = set(d for L in df['drug_list'] for d in L)
for d in unique_drugs:
    h = np.zeros(FP_DIM, dtype=np.float32)
    hash_val = abs(hash(d)) % FP_DIM
    h[hash_val] = 1.0
    drug_fps[d] = h

def mean_drug_fp(drug_list):
    fps = [drug_fps.get(d) for d in drug_list if d in drug_fps]
    if not fps:
        return np.zeros(FP_DIM, dtype=np.float32)
    return np.mean(fps, axis=0)

df['drug_fp_mean'] = [mean_drug_fp(L) for L in df['drug_list']]
print("Hashed FP done.")

# ---------- SE Embeddings ----------
se_df = pd.read_csv(SIDE_EFFECTS, low_memory=False)
se_embed_cols = [col for col in se_df.columns if col not in ['umls_cui_from_meddra', 'side_effect_name']]
print(f"SE embeds dims: {len(se_embed_cols)}")

se_embed_map = dict(zip(se_df['umls_cui_from_meddra'].astype(str), se_df[se_embed_cols].values))

default_embed = np.zeros(len(se_embed_cols), dtype=np.float32)
df['se_embed'] = [se_embed_map.get(se, default_embed) for se in df['SE_above_0.9'].astype(str)]

# ---------- Features ----------
all_drugs = [d for L in df['drug_list'] for d in L]
most_common = [d for d,_ in Counter(all_drugs).most_common(TOP_K)]
most_common_set = set(most_common)
df['drug_topk'] = df['drug_list'].apply(lambda L: [d for d in L if d in most_common_set])
df = df[df['drug_topk'].apply(len) > 0].reset_index(drop=True)

mlb = MultiLabelBinarizer(classes=most_common)
X_bag = mlb.fit_transform(df['drug_topk'])
X_bag_sp = sparse.csr_matrix(X_bag)

svd = TruncatedSVD(n_components=SVD_DIM, random_state=RANDOM_STATE)
X_svd = svd.fit_transform(X_bag_sp)

df['n_drugs'] = df['drug_topk'].apply(len)
df['possible_pairs'] = df['n_drugs'].apply(lambda n: max(1, n*(n-1)//2))
def time_to_float(t):
    try:
        y,q = str(t).split('Q'); return int(y) + (int(q)-1)/4.0
    except: return 0.0
df['time_num'] = df['time'].apply(time_to_float) if 'time' in df.columns else 0

num_feats = df[['n_drugs','possible_pairs','time_num']].fillna(0).values.astype(float)
X_fp = np.vstack(df['drug_fp_mean'].values)
se_embed_arr = np.vstack(df['se_embed'].values)

X_full_dense = np.hstack([X_svd, num_feats, X_fp, se_embed_arr])

print(f"Full feature shape: {X_full_dense.shape}")

# ---------- Split ----------
X_temp, X_test, y_temp, y_test = train_test_split(X_full_dense, df['label'], test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=df['label'])
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.25, random_state=RANDOM_STATE, stratify=y_temp)

print(f"Shapes -> train: {X_train.shape}, valid: {X_valid.shape}, test: {X_test.shape}")

classes = np.unique(y_train)
cw = compute_class_weight('balanced', classes=classes, y=y_train)
cw_map = {c:w for c,w in zip(classes, cw)}
sample_weight_train = np.array([cw_map[v] for v in y_train])

# ---------- XGB ----------
clf = XGBClassifier(
    objective='binary:logistic', eval_metric='auc', tree_method='hist',
    n_estimators=200, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8,
    reg_alpha=0.1, reg_lambda=0.1, random_state=RANDOM_STATE, n_jobs=N_THREADS,
    early_stopping_rounds=20
)

print("\nTraining XGB...")
clf.fit(X_train, y_train, sample_weight=sample_weight_train,
        eval_set=[(X_valid, y_valid)], verbose=20)

# Final fit: Disable early stopping
clf_no_es = clone(clf)
clf_no_es.early_stopping_rounds = None
X_trainplus = np.vstack([X_train, X_valid])
y_trainplus = np.concatenate([y_train, y_valid])
sw_trainplus = np.array([cw_map.get(v,1.0) for v in y_trainplus])
clf_no_es.fit(X_trainplus, y_trainplus, sample_weight=sw_trainplus)

final_model = clf_no_es

# ---------- Evaluate ----------
y_test_proba = final_model.predict_proba(X_test)[:,1]
y_test_pred = (y_test_proba >= 0.5).astype(int)

print("\n📊 XGB Performance")
print("ROC-AUC:", round(roc_auc_score(y_test, y_test_proba), 4))
print("F1:", round(f1_score(y_test, y_test_pred), 4))
print(classification_report(y_test, y_test_pred))

# Threshold tuning
thresholds = np.arange(0.3, 0.7, 0.05)
best_f1, best_thresh = 0, 0.5
for thresh in thresholds:
    pred_thresh = (y_test_proba >= thresh).astype(int)
    f1 = f1_score(y_test, pred_thresh)
    if f1 > best_f1:
        best_f1, best_thresh = f1, thresh
print(f"Best F1: {round(best_f1, 4)} at threshold {best_thresh}")

# ---------- Save ----------
joblib.dump(mlb, "mlb_fast.joblib")
joblib.dump(svd, "svd_fast.joblib")
joblib.dump(final_model, "xgb_fast.joblib")
print("\n✅ Saved fast XGB (<5min)")

Sampled data shape: (100000, 7), label dist: {1: 50000, 0: 50000}
Hashed FP done.
SE embeds dims: 768
Full feature shape: (94350, 1091)
Shapes -> train: (56610, 1091), valid: (18870, 1091), test: (18870, 1091)

Training XGB...
[0]	validation_0-auc:0.80787
[20]	validation_0-auc:0.87567
[40]	validation_0-auc:0.88466
[60]	validation_0-auc:0.88936
[80]	validation_0-auc:0.89300
[100]	validation_0-auc:0.89574
[120]	validation_0-auc:0.89763
[140]	validation_0-auc:0.89969
[160]	validation_0-auc:0.90108
[180]	validation_0-auc:0.90277
[199]	validation_0-auc:0.90357

📊 XGB Performance
ROC-AUC: 0.9059
F1: 0.8295
              precision    recall  f1-score   support

           0       0.81      0.85      0.83      9165
           1       0.85      0.81      0.83      9705

    accuracy                           0.83     18870
   macro avg       0.83      0.83      0.83     18870
weighted avg       0.83      0.83      0.83     18870

Best F1: 0.8367 at threshold 0.39999999999999997

✅ Saved fast XG

# Full samples

In [63]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from scipy.sparse import hstack, csr_matrix
from collections import Counter

# Paths
DATA_DIR = "/kaggle/input/drug-datasets"
POS_CSV = f"{DATA_DIR}/pos.csv"
NEG_CSV = f"{DATA_DIR}/neg.csv"
DB_SMILES = f"{DATA_DIR}/DrugBankID2SMILES.csv"
SIDE_EFFECTS = f"{DATA_DIR}/Side_effects_unique.csv"

# Load model and encoders
mlb = joblib.load("mlb_fast.joblib")
svd = joblib.load("svd_fast.joblib")
model = joblib.load("xgb_fast.joblib")

print("Loaded model and encoders.")

# Load and prepare data
pos = pd.read_csv(POS_CSV)
neg = pd.read_csv(NEG_CSV)

pos['hyperedge_label'] = pos.get('hyperedge_label', 1).fillna(1).astype(int)
neg['hyperedge_label'] = neg.get('hyperedge_label', -1).fillna(-1).astype(int)

df = pd.concat([pos, neg], ignore_index=True).reset_index(drop=True)
df['label'] = (df['hyperedge_label'] == 1).astype(int)

def parse_druglist(s):
    if pd.isna(s): return []
    try:
        L = eval(s)
        return [str(x).strip() for x in L if str(x).strip()]
    except:
        s2 = str(s).strip().strip('[]')
        return [x.strip().strip("'\"") for x in s2.split(',') if x.strip()]

df['drug_list'] = df['DrugBankID'].apply(parse_druglist)
df['drug_list'] = df['drug_list'].apply(lambda L: list(dict.fromkeys(L)))  # dedupe
df = df[df['drug_list'].apply(len) > 0].reset_index(drop=True)  # drop empty

# Get top_k from mlb
most_common = list(mlb.classes_)
most_common_set = set(most_common)
df['drug_topk'] = df['drug_list'].apply(lambda L: [d for d in L if d in most_common_set])
df = df[df['drug_topk'].apply(len) > 0].reset_index(drop=True)

# Numeric features
df['n_drugs'] = df['drug_topk'].apply(len)
df['possible_pairs'] = df['n_drugs'].apply(lambda n: max(1, n*(n-1)//2))

def time_to_float(t):
    try:
        y,q = str(t).split('Q'); return int(y) + (int(q)-1)/4.0
    except: return 0.0
df['time_num'] = df['time'].apply(time_to_float) if 'time' in df.columns else 0

num_feats = df[['n_drugs','possible_pairs','time_num']].fillna(0).values.astype(float)

# Drug FP (hashed for speed)
FP_DIM = 256
drug_fps = {}
unique_drugs = set(d for L in df['drug_list'] for d in L)
for d in unique_drugs:
    h = np.zeros(FP_DIM, dtype=np.float32)
    hash_val = abs(hash(d)) % FP_DIM
    h[hash_val] = 1.0
    drug_fps[d] = h

def mean_drug_fp(drug_list):
    fps = [drug_fps.get(d) for d in drug_list if d in drug_fps]
    if not fps:
        return np.zeros(FP_DIM, dtype=np.float32)
    return np.mean(fps, axis=0)

df['drug_fp_mean'] = [mean_drug_fp(L) for L in df['drug_list']]
X_fp = np.vstack(df['drug_fp_mean'].values)

# SE Embeddings
se_df = pd.read_csv(SIDE_EFFECTS, low_memory=False)
se_embed_cols = [col for col in se_df.columns if col not in ['umls_cui_from_meddra', 'side_effect_name']]

se_embed_map = dict(zip(se_df['umls_cui_from_meddra'].astype(str), se_df[se_embed_cols].values))

default_embed = np.zeros(len(se_embed_cols), dtype=np.float32)
df['se_embed'] = [se_embed_map.get(se, default_embed) for se in df['SE_above_0.9'].astype(str)]
se_embed_arr = np.vstack(df['se_embed'].values)

# Features
X_bag = mlb.transform(df['drug_topk'])
Xs = svd.transform(X_bag)
X = hstack([Xs, csr_matrix(num_feats), csr_matrix(X_fp), csr_matrix(se_embed_arr)])  # full hstack

y = df['label'].values

# Split (80/20, stratify)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Train/Test shapes: {X_train.shape}/{X_test.shape}")

# Predict
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba >= 0.5).astype(int)

# Metrics
auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)
print("\n📊 Model Evaluation on Test Set")
print(f"ROC-AUC: {auc:.4f}")
print(f"F1-Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Threshold tuning for F1
thresholds = np.arange(0.1, 0.9, 0.1)
best_f1 = 0
best_thresh = 0.5
for thresh in thresholds:
    y_pred_thresh = (y_pred_proba >= thresh).astype(int)
    curr_f1 = f1_score(y_test, y_pred_thresh)
    if curr_f1 > best_f1:
        best_f1 = curr_f1
        best_thresh = thresh

print(f"\nBest F1: {best_f1:.4f} at threshold {best_thresh:.2f}")

Loaded model and encoders.
Train/Test shapes: (167518, 1091)/(41880, 1091)

📊 Model Evaluation on Test Set
ROC-AUC: 0.8493
F1-Score: 0.7423

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.87      0.78     20340
           1       0.85      0.66      0.74     21540

    accuracy                           0.76     41880
   macro avg       0.78      0.77      0.76     41880
weighted avg       0.78      0.76      0.76     41880


Best F1: 0.7822 at threshold 0.30


In [64]:
import pandas as pd
import numpy as np
import joblib

# 1️⃣ Load mappings
drug_map_path = "/kaggle/input/drug-datasets/Drugbank_ID_SMILE_all_structure links.csv"
se_map_path = "/kaggle/input/drug-datasets/Side_effects_unique.csv"

drug_df = pd.read_csv(drug_map_path, low_memory=False)
se_df = pd.read_csv(se_map_path, low_memory=False)

# normalize column names
id_col = next((c for c in ["DrugBank ID", "drugbank_id", "DrugBank_ID"] if c in drug_df.columns), None)
name_col = next((c for c in ["Name", "Drug Name", "name"] if c in drug_df.columns), None)
assert id_col and name_col, "Could not find DrugBank ID/Name columns"

db_to_name = dict(zip(drug_df[id_col].astype(str).str.strip(), drug_df[name_col].astype(str).str.strip()))
se_to_name = dict(zip(se_df["umls_cui_from_meddra"].astype(str).str.strip(), se_df["side_effect_name"].astype(str).str.strip()))

def id_to_name(dbid): return db_to_name.get(dbid, dbid)
def se_to_label(seid): return se_to_name.get(seid, seid)

# 2️⃣ Load model and encoders
mlb = joblib.load("mlb_topk.joblib")
svd = joblib.load("svd.joblib")
model = joblib.load("xgb_final.joblib")

# 3️⃣ Prediction function with both mappings
def predict_with_names(drug_list, se_code=None, threshold=0.6):
    known_ids = [d for d in drug_list if d in mlb.classes_]
    known_names = [id_to_name(d) for d in known_ids]
    
    Xb = mlb.transform([known_ids])
    Xs = svd.transform(Xb)
    X_num = np.array([[len(known_ids), max(1, len(known_ids)*(len(known_ids)-1)//2), 0.0]])
    Xf = np.hstack([Xs, X_num])

    prob = float(model.predict_proba(Xf)[:,1])
    label = "HIGH RISK" if prob >= threshold else "LOW RISK"

    return {
        "drugbank_ids": known_ids,
        "drug_names": known_names,
        "n_drugs": len(known_ids),
        "probability": round(prob, 3),
        "label": label,
        "side_effect_code": se_code,
        "side_effect_name": se_to_label(se_code)
    }

# 4️⃣ Example
example = predict_with_names(["DB01050","DB00555","DB00472"], "C0151878")
print(example)


{'drugbank_ids': ['DB01050', 'DB00555', 'DB00472'], 'drug_names': ['Ibuprofen', 'Lamotrigine', 'Fluoxetine'], 'n_drugs': 3, 'probability': 0.46, 'label': 'LOW RISK', 'side_effect_code': 'C0151878', 'side_effect_name': 'Electrocardiogram QT prolonged'}


In [65]:
# Single verification cell: mapping coverage + model sanity + quick perf check
import pandas as pd, numpy as np, joblib, os
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# --- Config / paths (change if needed) ---
DRUG_MAP = "/kaggle/input/drug-datasets/Drugbank_ID_SMILE_all_structure links.csv"
SE_MAP = "/kaggle/input/drug-datasets/Side_effects_unique.csv"
DF_COMPACT = "df_compact_for_lookup.csv"   # saved earlier by pipeline
MLB_PATH = "mlb_topk.joblib"
SVD_PATH = "svd_topk.joblib"               # or "svd.joblib"
MODEL_PATHS = ["xgb_final.joblib","xgb_final.pkl"]
THRESH = 0.6   # same threshold logic you use for HIGH/LOW risk

# --- load maps ---
drug_df = pd.read_csv(DRUG_MAP, low_memory=False) if os.path.exists(DRUG_MAP) else None
se_df = pd.read_csv(SE_MAP, low_memory=False) if os.path.exists(SE_MAP) else None

# find ID/NAME columns robustly
db_id_col = None
db_name_col = None
if drug_df is not None:
    for c in drug_df.columns:
        if 'drugbank' in c.lower() or 'drug bank' in c.lower() or 'drugbank id' in c.lower():
            db_id_col = c
            break
    for c in drug_df.columns:
        if c.lower() in ('name','drug name','drug'):
            db_name_col = c
            break
    if db_id_col is None: db_id_col = drug_df.columns[0]
    if db_name_col is None: db_name_col = drug_df.columns[1] if drug_df.shape[1]>1 else drug_df.columns[0]

db_to_name = {}
if drug_df is not None:
    db_to_name = dict(zip(drug_df[db_id_col].astype(str).str.strip(), drug_df[db_name_col].astype(str).str.strip()))

se_to_name = {}
if se_df is not None:
    # many SE files use 'umls_cui_from_meddra' and 'side_effect_name'
    cui_col = next((c for c in se_df.columns if 'umls' in c.lower() or 'cui' in c.lower()), None)
    name_col = next((c for c in se_df.columns if 'side_effect' in c.lower() or 'recommended' in c.lower()), None)
    if cui_col is None: cui_col = se_df.columns[0]
    if name_col is None: name_col = se_df.columns[1] if se_df.shape[1]>1 else se_df.columns[0]
    se_to_name = dict(zip(se_df[cui_col].astype(str).str.strip(), se_df[name_col].astype(str).str.strip()))

# --- load model + encoders ---
assert os.path.exists(MLB_PATH), f"{MLB_PATH} not found. Run preprocessing cell first."
mlb = joblib.load(MLB_PATH)
svd = joblib.load(SVD_PATH) if os.path.exists(SVD_PATH) else None

model = None
for mp in MODEL_PATHS:
    if os.path.exists(mp):
        model = joblib.load(mp)
        break
assert model is not None, "Model file not found. Save xgb_final.joblib or xgb_final.pkl first."

# --- load df for verification (use df_compact if present, else load pos/neg) ---
if os.path.exists(DF_COMPACT):
    df = pd.read_csv(DF_COMPACT, low_memory=False)
else:
    # fallback try pos + neg
    pos_path = next((p for p in ["/kaggle/input/drug-datasets/pos.csv","/kaggle/input/drug-datasets/positive.csv"] if os.path.exists(p)), None)
    neg_path = next((p for p in ["/kaggle/input/drug-datasets/neg.csv","/kaggle/input/drug-datasets/nogative.csv"] if os.path.exists(p)), None)
    if pos_path and neg_path:
        pos = pd.read_csv(pos_path, low_memory=False)
        neg = pd.read_csv(neg_path, low_memory=False)
        df = pd.concat([pos, neg], ignore_index=True)
    else:
        raise FileNotFoundError("No df_compact_for_lookup.csv and no pos/neg fallback found in dataset.")

# ensure 'drug_topk' or parse DrugBankID
if 'drug_topk' not in df.columns:
    if 'DrugBankID' in df.columns:
        import ast
        def parse_list(s):
            try:
                L = ast.literal_eval(s)
                if isinstance(L, list): return [str(x).strip() for x in L]
            except: pass
            s2 = str(s).strip('[] ')
            return [x.strip().strip("'\"") for x in s2.split(',') if x.strip()]
        df['drug_topk'] = df['DrugBankID'].apply(parse_list)
    else:
        raise KeyError("df must contain 'drug_topk' or 'DrugBankID' column")

# build helper predictor same as used in pipeline (SVD + numeric placeholder)
def pipeline_predict_probs(drug_list):
    # filter known mlb classes
    known = [d for d in drug_list if d in mlb.classes_]
    if len(known)==0:
        return np.array([0.0])
    Xb = mlb.transform([known])
    Xs = svd.transform(Xb) if svd is not None else Xb
    n_drugs = len(known)
    possible_pairs = max(1, n_drugs*(n_drugs-1)//2)
    X_num = np.array([[n_drugs, possible_pairs, 0.0]])  # time unknown
    Xf = np.hstack([Xs, X_num])
    probs = model.predict_proba(Xf)[:,1]
    return probs

# 1) drug mapping coverage
mlb_classes = list(mlb.classes_)
mapped = [1 for d in mlb_classes if d in db_to_name]
coverage_pct = 100.0 * sum(mapped)/len(mlb_classes)
missing = [d for d in mlb_classes if d not in db_to_name]
print("=== drug name mapping coverage ===")
print(f"MLB has {len(mlb_classes)} drug IDs. {coverage_pct:.1f}% map to a human name.")
print("First 20 missing sample:", missing[:20])

# 2) SE mapping coverage (columns might be 'SE_above_0.9' or similar)
se_codes = []
if 'SE_above_0.9' in df.columns:
    se_codes = df['SE_above_0.9'].astype(str).unique().tolist()
elif 'se' in df.columns:
    se_codes = df['se'].astype(str).unique().tolist()
else:
    print("No SE column found in df to check mapping coverage.")
if se_codes:
    mapped_se = [1 for s in se_codes if s in se_to_name]
    se_cov = 100.0 * sum(mapped_se)/len(se_codes)
    missing_se = [s for s in se_codes if s not in se_to_name]
    print("\n=== side-effect mapping coverage ===")
    print(f"{len(se_codes)} unique SE codes in df. {se_cov:.1f}% map to a readable SE name.")
    print("First 20 missing SE codes sample:", missing_se[:20])

# 3) Model probability sanity test on a few examples
print("\n=== model probability sanity (sample 50 rows) ===")
sample_rows = df.sample(min(50, len(df)), random_state=42).reset_index(drop=True)
probs = []
for i,row in sample_rows.iterrows():
    drug_list = row.get('drug_topk') if isinstance(row.get('drug_topk'), list) else []
    if not drug_list and row.get('DrugBankID'):
        import ast
        try:
            drug_list = ast.literal_eval(row['DrugBankID'])
        except:
            drug_list = []
    p = pipeline_predict_probs(drug_list)[0]
    probs.append(p)
    if not (0.0 <= p <= 1.0):
        print("PROBABILITY OUT OF RANGE for row", i, "p=", p)
print("Probs min/max:", np.min(probs), np.max(probs))

# 4) Quick performance check on a random subset (if df contains 'label')
if 'label' in df.columns:
    print("\n=== quick perf check on 200-sample (if available) ===")
    sub = df.sample(min(200, len(df)), random_state=1)
    y_true = []
    y_pred = []
    y_prob = []
    for idx,row in sub.iterrows():
        drug_list = row.get('drug_topk') if isinstance(row.get('drug_topk'), list) else []
        p = pipeline_predict_probs(drug_list)[0]
        y_prob.append(p)
        y_pred.append(1 if p>=THRESH else 0)
        y_true.append(int(row.get('label')))
    try:
        auc = roc_auc_score(y_true, y_prob)
    except:
        auc = float('nan')
    f1 = f1_score(y_true, y_pred, zero_division=0)
    cm = confusion_matrix(y_true, y_pred)
    print(f"sample size={len(sub)}  ROC-AUC={auc:.4f}  F1={f1:.4f}")
    print("Confusion matrix (rows=true  cols=pred):")
    print(cm)
else:
    print("No 'label' column present for perf check (skip).")

# 5) Basic assertions -> print action items
print("\n=== automated checks summary & suggestions ===")
if coverage_pct < 80:
    print(f"- WARNING: drug->name coverage is low ({coverage_pct:.1f}%). If you need readable names, update Drugbank CSV or join on different ID column.")
else:
    print(f"- OK: drug->name coverage {coverage_pct:.1f}%")

if se_codes and se_cov < 80:
    print(f"- WARNING: SE mapping low ({se_cov:.1f}%). Consider enriching Side_effects_unique.csv or mapping via UMLS/CUI.")
else:
    print(f"- OK: SE mapping {se_cov:.1f}%")

# final example print using mapping to human-readable
ex = sample_rows.iloc[0]
drug_list = ex.get('drug_topk') if isinstance(ex.get('drug_topk'), list) else []
pro = pipeline_predict_probs(drug_list)[0]
readable_drugs = [db_to_name.get(d,d) for d in drug_list]
se_code = ex.get('SE_above_0.9') if 'SE_above_0.9' in ex.index else None
se_name = se_to_name.get(str(se_code), se_code)
print("\nExample row check:")
print("drug IDs:", drug_list)
print("drug names (mapped):", readable_drugs)
print("SE code:", se_code, "SE name:", se_name)
print("pred prob:", pro, "label:", ("HIGH_RISK" if pro>=THRESH else "LOW_RISK"))

print("\nIf anything flagged above, tell me the flagged section name and I'll give fixes (example: 'drug mapping' or 'model probs').")


=== drug name mapping coverage ===
MLB has 500 drug IDs. 93.6% map to a human name.
First 20 missing sample: ['DB00047', 'DB00073', 'DB00005', 'DB13961', 'DB06273', 'DB01306', 'DB00065', 'DB06643', 'DB00046', 'DB14009', 'DB06285', 'DB09029', 'DB11088', 'DB15696', 'DB01281', 'DB00030', 'DB00043', 'DB00028', 'DB00072', 'DB15889']

=== side-effect mapping coverage ===
4581 unique SE codes in df. 100.0% map to a readable SE name.
First 20 missing SE codes sample: []

=== model probability sanity (sample 50 rows) ===
Probs min/max: 0.0 0.0

=== quick perf check on 200-sample (if available) ===
sample size=200  ROC-AUC=0.5000  F1=0.0000
Confusion matrix (rows=true  cols=pred):
[[ 81   0]
 [119   0]]

=== automated checks summary & suggestions ===
- OK: drug->name coverage 93.6%
- OK: SE mapping 100.0%

Example row check:
drug IDs: []
drug names (mapped): []
SE code: C0595877 SE name: Blood glucose increased
pred prob: 0.0 label: LOW_RISK

If anything flagged above, tell me the flagged sectio