In [1]:
import sys
import numpy as np
import pandas as pd
import numpy as np
import os
import json
from rdkit import Chem
from rdkit.Chem import AllChem


In [2]:
NCI60_CANONICAL = [
    # Leukemia (6)
    "CCRF-CEM", "HL-60(TB)", "K-562", "MOLT-4", "RPMI-8226", "SR",
    # CNS (6)
    "SF-268", "SF-295", "SF-539", "SNB-19", "SNB-75", "U251",
    # Colon (7)
    "COLO 205", "HCC-2998", "HCT-116", "HCT-15", "HT29", "KM12", "SW-620",
    # Non‑small‑cell lung (9)
    "A549/ATCC", "EKVX", "HOP-62", "HOP-92",
    "NCI-H226", "NCI-H23", "NCI-H322M", "NCI-H460", "NCI-H522",
    # Melanoma (9)
    "LOX IMVI", "MALME-3M", "M14", "MDA-MB-435",
    "SK-MEL-2", "SK-MEL-28", "SK-MEL-5", "UACC-257", "UACC-62",
    # Ovarian (7)
    "IGROV1", "OVCAR-3", "OVCAR-4", "OVCAR-5", "OVCAR-8", "NCI/ADR-RES", "SK-OV-3",
    # Renal (8)
    "786-0", "A498", "ACHN", "CAKI-1", "RXF 393", "SN12C", "TK-10", "UO-31",
    # Breast (6)
    "BT-549", "HS 578T", "MCF7", "T-47D", "MDA-MB-231/ATCC", "MDA-MB-468",
    # Prostate (2)
    "PC-3", "DU-145",
]
assert len(NCI60_CANONICAL) == 60


In [3]:
# --- 1. Load and pivot GI50 data ------------------------------------------
import pandas as pd

gi50_raw = pd.read_csv("GI50.csv")

# Pivot to wide table (one column per cell line)
gi50_df = (
    gi50_raw
    .pivot_table(index="NSC", columns="CELL_NAME", values="AVERAGE", aggfunc="mean")
    .reset_index()
)
gi50_df.columns.name = None

# Keep only the 60 canonical columns (plus NSC)
keep_cols = ["NSC"] + NCI60_CANONICAL
gi50_df = gi50_df.loc[:, gi50_df.columns.isin(keep_cols)]

# --- 2. Load SMILES data ---------------------------------------------------
smiles_df = pd.read_csv("nsc_smiles.csv")          # must contain 'NSC', 'SMILES'

# --- 3. Merge on NSC to get full dataset -----------------------------------
merged_df = pd.merge(gi50_df, smiles_df, on="NSC")
merged_df["SMILES"] = merged_df["SMILES"].astype(str)

# Save the order once so loaders / models can reuse it
import json, pathlib
pathlib.Path("cell_line_order.json").write_text(
    json.dumps(NCI60_CANONICAL, indent=2)
)
print(f"Resulting dataframe shape: {merged_df.shape}")   



Resulting dataframe shape: (58329, 62)


In [4]:
# --- Build dictionary: NSC → {SMILES, label_vector} ------------------------
from collections import Counter
import pandas as pd

def categorize_gi50(val):
    if pd.isna(val):
        return None
    p = -val                         # convert −log10 → +log10
    if p < 4.1:   return 0
    elif p < 5.0: return 1
    elif p < 6.0: return 2
    elif p < 7.0: return 3
    elif p < 8.0: return 4
    else:         return 5

mol_dict      = {}                                          # master store
gi50_columns  = [c for c in merged_df.columns if c not in ("NSC", "SMILES")]

for _, row in merged_df.iterrows():
    NSC    = row["NSC"]
    SMILES = row["SMILES"]

    potencies = {cl: categorize_gi50(row[cl]) for cl in gi50_columns}
    cats      = [v for v in potencies.values() if v is not None]
    if not cats:                                    # skip compounds w/ no data
        continue

    mol_dict[NSC] = {
        "smiles"       : SMILES,                                   # keep
        "label_vector" : [potencies[cl] if potencies[cl] is not None else -1
                          for cl in gi50_columns]                  # keep
        # NOTE: NSC, mol_concentration, potency *not* stored
    }

print(f"Parsed {len(mol_dict):,} usable compounds.")



Parsed 58,329 usable compounds.


In [5]:
import numpy as np

np.random.seed(42)
NSC_list = list(mol_dict.keys())
np.random.shuffle(NSC_list)

n       = len(NSC_list)
n_train = int(0.8 * n)
n_val   = int(0.1 * n)

splits = {
    "train": NSC_list[:n_train],
    "val"  : NSC_list[n_train:n_train + n_val],
    "test" : NSC_list[n_train + n_val:]
}

for s, idxs in splits.items():
    print(f"{s:<5}  {len(idxs):,}")


train  46,663
val    5,832
test   5,834


In [6]:
import gzip, json, os, shutil

# Remove any old per‑compound directories to free space
for folder in ["train", "val", "test"]:
    if os.path.isdir(folder):
        shutil.rmtree(folder)

# Open three writers once
writers = {
    "train": gzip.open("train.jsonl.gz", "wt"),
    "val"  : gzip.open("val.jsonl.gz",   "wt"),
    "test" : gzip.open("test.jsonl.gz",  "wt"),
}


for split_name, NSCs in splits.items():
    fh = writers[split_name]
    for NSC in NSCs:
        fh.write(json.dumps(mol_dict[NSC]) + "\n")


for fh in writers.values():
    fh.close()

print("Finished writing: ",
      ", ".join(f"{p} ({os.path.getsize(p)/1e6:.1f} MB)"
                for p in ["train.jsonl.gz","val.jsonl.gz","test.jsonl.gz"]))



Finished writing:  train.jsonl.gz (1.8 MB), val.jsonl.gz (0.2 MB), test.jsonl.gz (0.2 MB)


In [1]:
#EXAMPLES
#Modify n_preview to get more/less molecules

import gzip, json

n_preview = 1                 
with gzip.open("train.jsonl.gz", "rt") as f:
    for i, line in enumerate(f):
        if i >= n_preview:
            break
        record = json.loads(line)
        print(f"--- record {i} ---")
        print(json.dumps(record, indent=2))
        print()


--- record 0 ---
{
  "smiles": "CCC(C)C1=C(O)[N+]2([O-])Oc3ccc(cc3OC2(C(C)CC)C(=O)N1O)C4=C(O)C(=C(C(=O)C4=O)c5ccc(O)cc5)O",
  "label_vector": [
    1,
    2,
    2,
    1,
    2,
    2,
    1,
    1,
    1,
    1,
    2,
    2,
    2,
    1,
    2,
    2,
    2,
    1,
    1,
    2,
    1,
    2,
    2,
    1,
    2,
    1,
    2,
    -1,
    1,
    2,
    1,
    1,
    2,
    1,
    2,
    2,
    0,
    0,
    2,
    2,
    2,
    2,
    2,
    1,
    1,
    -1,
    0,
    2,
    -1,
    -1,
    2,
    0,
    2,
    1,
    -1,
    2,
    2,
    1,
    2,
    1
  ]
}

