In [12]:
import numpy as np
print(np.__version__)


1.26.4


In [13]:
import sys
print(sys.executable)
print(sys.version)
print(sys.path)


/home/nbilic/miniconda3/envs/Nandos/bin/python
3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 16:33:10) 
[GCC 12.3.0]
['/home/nbilic/miniconda3/envs/Nandos/lib/python39.zip', '/home/nbilic/miniconda3/envs/Nandos/lib/python3.9', '/home/nbilic/miniconda3/envs/Nandos/lib/python3.9/lib-dynload', '', '/home/nbilic/miniconda3/envs/Nandos/lib/python3.9/site-packages']


In [14]:
import pandas as pd
import numpy as np
import os
import json
from rdkit import Chem
from rdkit.Chem import AllChem


In [7]:
# --- 1. Load and pivot GI50 data ---
gi50_raw = pd.read_csv("GI50.csv")  

gi50_df = gi50_raw.pivot_table(
    index="NSC",
    columns="CELL_NAME",
    values="AVERAGE",
    aggfunc="mean"
).reset_index()

gi50_df.columns.name = None  # Optional: remove "CELL_NAME" header level

# --- 2. Load SMILES data ---
smiles_df = pd.read_csv("nsc_smiles.csv")  # Should have 'NSC', 'SMILES'

# --- 3. Merge on NSC to get full dataset ---
merged_df = pd.merge(gi50_df, smiles_df, on="NSC")
merged_df["SMILES"] = merged_df["SMILES"].astype(str)


In [91]:
from collections import Counter
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

def categorize_gi50(val):
    if pd.isna(val):
        return None
    p = -val                        # convert −log10 → +log10
    if p < 4.1:   return 0
    elif p < 5.0: return 1
    elif p < 6.0: return 2
    elif p < 7.0: return 3
    elif p < 8.0: return 4
    else:         return 5

# Build dictionary: NSC → {SMILES, concentrations, potencies, label}
mol_dict = {}

gi50_columns = [c for c in merged_df.columns if c not in ("NSC", "SMILES")]

for _, row in merged_df.iterrows():
    NSC   = row["NSC"]
    SMILES = row["SMILES"]

    # per-cell-line numbers and categories
    concentrations = {col: row[col]               for col in gi50_columns}
    potencies      = {col: categorize_gi50(row[col]) for col in gi50_columns}

    # majority label, ignoring missing values
    cats = [v for v in potencies.values() if v is not None]
    if not cats:                       # skip compounds with no data
        continue
    majority_label = Counter(cats).most_common(1)[0][0]

    mol_dict[NSC] = {
        "NSC": NSC,
        "SMILES": SMILES,
        "mol_concentration": concentrations,
        "cancer_type": gi50_columns,
        "potency": potencies,
        "label": [potencies[cl] if potencies[cl] is not None else -1 for cl in gi50_columns],

    }


In [92]:
import numpy as np
import os
import json

np.random.seed(42)
NSC_list = list(mol_dict.keys())
np.random.shuffle(NSC_list)

n = len(NSC_list)
n_train = int(0.8 * n)
n_val = int(0.1 * n)

splits = {
    "train": NSC_list[:n_train],
    "val": NSC_list[n_train:n_train + n_val],
    "test": NSC_list[n_train + n_val:]
}


In [8]:
import shutil

# Clear old folders
for folder in ["train", "val", "test"]:
    if os.path.exists(folder):
        shutil.rmtree(folder)  # Delete the folder and all contents
    os.makedirs(folder)  # Recreate empty folder


In [94]:
# Ensure folders exist
for folder in ["train", "val", "test"]:
    os.makedirs(folder, exist_ok=True)

# Save each molecule as its own JSON file
for split_name, NSCs in splits.items():
    for NSC in NSCs:
        record = mol_dict[NSC]
        filename = os.path.join(split_name, f"{NSC}.json")
        with open(filename, "w") as f:
            json.dump({
                "NSC": record["NSC"],
                "SMILES": record["SMILES"],
                "label": record["label"]
            }, f)


OSError: [Errno 122] Disk quota exceeded

In [None]:
import gzip, json

n_preview = 5                 # change to see more / fewer
with gzip.open("train.jsonl.gz", "rt") as f:
    for i, line in enumerate(f):
        if i >= n_preview:
            break
        record = json.loads(line)
        print(f"--- record {i} ---")
        print(json.dumps(record, indent=2))
        print()

