In [34]:
import deepchem as dc
from deepchem.molnet import load_toxcast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the ToxCast dataset
tasks, datasets, transformers = load_toxcast()

# Unpack the datasets
train_dataset, valid_dataset, test_dataset = datasets

# Print basic dataset information
print("=== ToxCast Dataset Overview ===")
print(f"Number of tasks/targets: {len(tasks)}")
print(f"Training samples: {train_dataset.X.shape[0]}")
print(f"Validation samples: {valid_dataset.X.shape[0]}")
print(f"Test samples: {test_dataset.X.shape[0]}")
print(f"Total samples: {train_dataset.X.shape[0] + valid_dataset.X.shape[0] + test_dataset.X.shape[0]}")
print(f"Number of features: {train_dataset.X.shape[1]}")

# Display first few task names
print("\n=== First 10 Target Tasks ===")
for i, task in enumerate(tasks[:10]):
    print(f"{i+1}. {task}")
print("..." if len(tasks) > 10 else "")

# Display one sample with its features and labels
print("\n=== Sample Data (First Example) ===")
print("Features (first 10 values):")
print(train_dataset.X[0][:10])
print(f"Feature shape: {train_dataset.X[0].shape}")
print("\nLabels (first 10 values):")
print(train_dataset.y[0][:10])
print(f"Label shape: {train_dataset.y[0].shape}")

# Check for missing values in the first sample
print("\n=== Missing Values in First Sample ===")
nan_count = np.sum(np.isnan(train_dataset.y[0]))
print(f"Number of NaN values: {nan_count} out of {train_dataset.y[0].shape[0]}")

# Display SMILES identifier (if available)
if hasattr(train_dataset, 'ids') and len(train_dataset.ids) > 0:
    print("\n=== Sample Molecule ID ===")
    print(f"First molecule ID: {train_dataset.ids[0]}")


[06:12:12] Explicit valence for atom # 0 F, 1, is greater than permitted
Failed to featurize datapoint 1039, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
[06:12:13] Explicit valence for atom # 2 Cl, 1, is greater than permitted
Failed to featurize datapoint 1789, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
[06:12:13] Explicit valence for atom # 0 Cl, 1, is greater than permitted
Failed to featurize datapoint 1881, Non

=== ToxCast Dataset Overview ===
Number of tasks/targets: 617
Training samples: 6863
Validation samples: 858
Test samples: 858
Total samples: 8579
Number of features: 1024

=== First 10 Target Tasks ===
1. ACEA_T47D_80hr_Negative
2. ACEA_T47D_80hr_Positive
3. APR_HepG2_CellCycleArrest_24h_dn
4. APR_HepG2_CellCycleArrest_24h_up
5. APR_HepG2_CellCycleArrest_72h_dn
6. APR_HepG2_CellLoss_24h_dn
7. APR_HepG2_CellLoss_72h_dn
8. APR_HepG2_MicrotubuleCSK_24h_dn
9. APR_HepG2_MicrotubuleCSK_24h_up
10. APR_HepG2_MicrotubuleCSK_72h_dn
...

=== Sample Data (First Example) ===
Features (first 10 values):
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Feature shape: (1024,)

Labels (first 10 values):
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Label shape: (617,)

=== Missing Values in First Sample ===
Number of NaN values: 0 out of 617

=== Sample Molecule ID ===
First molecule ID: C[SiH](C)O[Si](C)(C)O[Si](C)(C)O[SiH](C)C


In [38]:
load_toxcast()

(['ACEA_T47D_80hr_Negative',
  'ACEA_T47D_80hr_Positive',
  'APR_HepG2_CellCycleArrest_24h_dn',
  'APR_HepG2_CellCycleArrest_24h_up',
  'APR_HepG2_CellCycleArrest_72h_dn',
  'APR_HepG2_CellLoss_24h_dn',
  'APR_HepG2_CellLoss_72h_dn',
  'APR_HepG2_MicrotubuleCSK_24h_dn',
  'APR_HepG2_MicrotubuleCSK_24h_up',
  'APR_HepG2_MicrotubuleCSK_72h_dn',
  'APR_HepG2_MicrotubuleCSK_72h_up',
  'APR_HepG2_MitoMass_24h_dn',
  'APR_HepG2_MitoMass_24h_up',
  'APR_HepG2_MitoMass_72h_dn',
  'APR_HepG2_MitoMass_72h_up',
  'APR_HepG2_MitoMembPot_1h_dn',
  'APR_HepG2_MitoMembPot_24h_dn',
  'APR_HepG2_MitoMembPot_72h_dn',
  'APR_HepG2_MitoticArrest_24h_up',
  'APR_HepG2_MitoticArrest_72h_up',
  'APR_HepG2_NuclearSize_24h_dn',
  'APR_HepG2_NuclearSize_72h_dn',
  'APR_HepG2_NuclearSize_72h_up',
  'APR_HepG2_OxidativeStress_24h_up',
  'APR_HepG2_OxidativeStress_72h_up',
  'APR_HepG2_StressKinase_1h_up',
  'APR_HepG2_StressKinase_24h_up',
  'APR_HepG2_StressKinase_72h_up',
  'APR_HepG2_p53Act_24h_up',
  'APR_Hep

In [39]:
import deepchem as dc
from deepchem.molnet import load_toxcast
import numpy as np
import pandas as pd

# Load the ToxCast dataset
tasks, datasets, transformers = load_toxcast()

# Unpack the datasets
train_dataset, valid_dataset, test_dataset = datasets

# Combine all datasets
all_X = np.vstack([train_dataset.X, valid_dataset.X, test_dataset.X])
all_y = np.vstack([train_dataset.y, valid_dataset.y, test_dataset.y])
all_ids = np.concatenate([train_dataset.ids, valid_dataset.ids, test_dataset.ids])

# Create feature column names
feature_columns = [f'feature_{i}' for i in range(all_X.shape[1])]

# Create a DataFrame with features, labels, and IDs
df = pd.DataFrame(all_X, columns=feature_columns)
df[tasks] = all_y
df['molecule_id'] = all_ids

# Save the DataFrame to a CSV file
csv_filename = 'toxcast_dataset.csv'
df.to_csv(csv_filename, index=False)

print(f"Dataset saved to {csv_filename}")
print(f"Total rows: {len(df)}")
print(f"Total columns: {len(df.columns)}")


  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y
  df[tasks] = all_y


Dataset saved to toxcast_dataset.csv
Total rows: 8579
Total columns: 1642
