In [2]:
import scanpy as sc

# Absolute local path of the input .h5ad file; adjust when running elsewhere.
H5AD_PATH = r"C:\Users\PRASANTH\CellScribe\dataset\cellxgene.h5ad"
adata = sc.read_h5ad(H5AD_PATH)

In [3]:
def make_text(row):
    """Build a one-line text description from a metadata row.

    Parameters
    ----------
    row : mapping
        Any object supporting ``row[key]`` lookup for the keys
        ``'cluster_label'``, ``'condition'`` and ``'replicate'``.

    Returns
    -------
    str
        ``"<cluster_label>. Condition: <condition>. Replicate: <replicate>."``
    """
    label = row['cluster_label']
    condition = row['condition']
    replicate = row['replicate']
    return f"{label}. Condition: {condition}. Replicate: {replicate}."
# One description string per row of adata.obs, in row order.
obs_descriptions = adata.obs.apply(make_text, axis=1)
texts = obs_descriptions.tolist()


In [None]:
# Data matrix stored on the loaded object; its first axis is later used as the
# sample count (presumably cells x genes -- confirm against the dataset).
X = adata.X 


In [5]:
# Pull the "gene_name" column out of adata.var as a plain Python list.
gene_name_column = adata.var["gene_name"]
gene_names = gene_name_column.tolist()

In [None]:
# Bundle the matrix, the per-row descriptions and the gene names together.
dataset = dict(X=X, text=texts, gene_names=gene_names)

In [None]:
import numpy as np
import json
from sklearn.model_selection import train_test_split
from scipy import sparse


# Keep the matrix in SciPy sparse form: the per-split row selections are
# written with sparse.save_npz, which expects a sparse matrix.
X = adata.X
if not sparse.issparse(X):
    X = sparse.csr_matrix(X)

n = X.shape[0]

# `texts` and `gene_names` are built in earlier cells. Verify they still line
# up with X so that a stale kernel variable cannot silently pair rows with the
# wrong description or columns with the wrong gene name.
if len(texts) != n:
    raise ValueError(
        f"texts has {len(texts)} entries but X has {n} rows; "
        "re-run the cell that builds `texts`"
    )
if len(gene_names) != X.shape[1]:
    raise ValueError(
        f"gene_names has {len(gene_names)} entries but X has {X.shape[1]} columns; "
        "re-run the cell that builds `gene_names`"
    )

indices = np.arange(n)

# 70% train, 30% temp
train_idx, temp_idx = train_test_split(
    indices, test_size=0.30, random_state=42, shuffle=True
)

# split temp into 10% val, 20% test (2/3 of the 30% hold-out goes to test)
val_idx, test_idx = train_test_split(
    temp_idx, test_size=2/3, random_state=42, shuffle=True
)

def save_split(name, idx):
    """Write one data split to three files in the current directory.

    Reads the module-level ``X``, ``texts`` and ``gene_names``.

    Parameters
    ----------
    name : str
        Prefix for the output files ``{name}_X.npz``, ``{name}_text.json``
        and ``{name}_genes.json``.
    idx : sequence of int
        Row positions selecting the samples that belong to this split.
    """
    # Rows of the matrix for this split.
    sparse.save_npz(f"{name}_X.npz", X[idx])

    # Descriptions for the same rows, in the same order as `idx`.
    with open(f"{name}_text.json", "w") as text_file:
        split_texts = [texts[row] for row in idx]
        json.dump(split_texts, text_file)

    # The full gene-name list is stored alongside every split.
    with open(f"{name}_genes.json", "w") as genes_file:
        json.dump(gene_names, genes_file)

# Persist each split in turn: train, then val, then test.
for split_name, split_idx in (
    ("train", train_idx),
    ("val", val_idx),
    ("test", test_idx),
):
    save_split(split_name, split_idx)

print("Saved train / val / test splits")

✅ Saved train / val / test splits
