In [1]:
import pandas as pd

from rdkit.Chem import CanonSmiles, MolFromSmiles

In [2]:
def load_df(url: str) -> pd.DataFrame:
    """Load a DILI CSV and return it with RDKit-canonical SMILES.

    Only the ``smiles_r`` and ``TOXICITY`` columns are read; they are renamed
    to ``smiles`` and ``is_toxic``. Rows whose SMILES RDKit cannot parse are
    dropped, and the remaining SMILES are canonicalised with ``CanonSmiles``
    so that molecules can be matched across datasets by string equality.

    Args:
        url: Location of the CSV file, passed unchanged to ``pd.read_csv``.

    Returns:
        A new DataFrame with the columns ``smiles`` (canonical form) and
        ``is_toxic``. Surviving rows keep their original index labels.
    """
    df = pd.read_csv(url, usecols=["smiles_r", "TOXICITY"])
    df = df.rename(columns={"smiles_r": "smiles", "TOXICITY": "is_toxic"})
    # The source data has an invalid SMILES for fosfomycin (phosphomycin);
    # patch it here so the row is kept and the whole test set is preserved.
    df.loc[df["smiles"] == "CC1OC1[P](=O)(=O)O", "smiles"] = "CC1OC1[P](=O)(O)O"
    # Other rows also have valence problems that RDKit rejects - just drop them.
    # astype(bool) keeps the mask a boolean indexer even when the frame is empty.
    parseable = df["smiles"].map(lambda smi: MolFromSmiles(smi) is not None).astype(bool)
    # Take an explicit copy: assigning a column on a filtered slice of another
    # frame is what triggers pandas' SettingWithCopyWarning.
    df = df[parseable].copy()
    df["smiles"] = df["smiles"].map(CanonSmiles)
    return df

In [3]:
# Held-out test split, fetched from the srijitseal/DILI GitHub repository.
testing_df = load_df(
    url=r"https://raw.githubusercontent.com/srijitseal/DILI/refs/heads/main/test_data_heldouttest_DILIst_223.csv"
)

In [4]:
# Unique SMILES of the test split, used below for overlap checks.
testing_smiles = {*testing_df["smiles"]}

In [5]:
# Fine-tuning split, fetched from the srijitseal/DILI GitHub repository.
finetuning_df = load_df(
    url=r"https://raw.githubusercontent.com/srijitseal/DILI/refs/heads/main/data/DILI_Goldstandard_1111.csv"
)

[13:52:33] Explicit valence for atom # 1 P, 6, is greater than permitted
[13:52:33] Explicit valence for atom # 15 P, 6, is greater than permitted
[13:52:33] Explicit valence for atom # 15 P, 6, is greater than permitted
[13:52:33] Explicit valence for atom # 14 P, 6, is greater than permitted
[13:52:33] Explicit valence for atom # 6 P, 6, is greater than permitted
[13:52:33] Explicit valence for atom # 1 P, 6, is greater than permitted
[13:52:33] Explicit valence for atom # 21 P, 6, is greater than permitted
[13:52:33] Explicit valence for atom # 3 P, 6, is greater than permitted
[13:52:33] Explicit valence for atom # 5 P, 6, is greater than permitted
[13:52:33] Explicit valence for atom # 3 P, 6, is greater than permitted
[13:52:33] Explicit valence for atom # 7 P, 6, is greater than permitted
[13:52:33] Explicit valence for atom # 11 P, 6, is greater than permitted
[13:52:33] Explicit valence for atom # 11 P, 6, is greater than permitted
[13:52:33] Explicit valence for atom # 8 P, 6

In [6]:
# Drop every fine-tuning row whose SMILES also occurs in the test split,
# reporting the row count before and after.
print(f"{len(finetuning_df)=} before dropping testing overlap")
finetuning_df = finetuning_df.loc[
    lambda frame: ~frame["smiles"].isin(testing_smiles)
]
print(f"{len(finetuning_df)=} after dropping testing overlap")

len(finetuning_df)=1097 before dropping testing overlap
len(finetuning_df)=874 after dropping testing overlap


In [7]:
# Unique SMILES of the (already de-overlapped) fine-tuning split.
finetuning_smiles = {*finetuning_df["smiles"]}

In [8]:
# Pretraining split, fetched from the srijitseal/DILI GitHub repository.
pretraining_df = load_df(
    url=r"https://raw.githubusercontent.com/srijitseal/DILI/refs/heads/main/data/Alloriginaldata_19911.csv"
)

[13:52:36] Unusual charge on atom 16 number of radical electrons set to zero
[13:52:39] Unusual charge on atom 16 number of radical electrons set to zero


In [9]:
# Drop every pretraining row whose SMILES occurs in either the test split or
# the fine-tuning split. Filtering once against the union of the two sets
# keeps exactly the rows that survive both individual filters.
# NOTE: the "before" message names only the testing overlap, although both
# overlaps are removed before the "after" count is printed.
print(f"{len(pretraining_df)=} before dropping testing overlap")
pretraining_df = pretraining_df.loc[
    lambda frame: ~frame["smiles"].isin(testing_smiles | finetuning_smiles)
]
print(f"{len(pretraining_df)=} after dropping testing and finetuning overlap")

len(pretraining_df)=19911 before dropping testing overlap
len(pretraining_df)=18948 after dropping testing and finetuning overlap


In [10]:
# Write the three splits to CSV files in the working directory, omitting the
# index column.
for frame, filename in (
    (testing_df, "testing.csv"),
    (pretraining_df, "pretraining.csv"),
    (finetuning_df, "finetuning.csv"),
):
    frame.to_csv(filename, index=False)