In [None]:
# Load the raw bioactivity export (the file name indicates MDM2 data).
file = "./MDM2_bioactivity_data_raw.csv"
primarydf = pd.read_csv(filepath_or_buffer=file)

# Discard every row that has no value in the "activity" column.
required_columns = ["activity"]
df_clean = primarydf.dropna(subset=required_columns)
-----------------------------------------
# Deduplication
import sys
sys.path.append("..")

from optunaz.utils.preprocessing.deduplicator import *

col = "canonical_smiles"

# Collapse repeated SMILES with the KeepMedian deduplicator
# (per the original note: the median value is kept for each structure).
df_med = KeepMedian().dedup(df_clean, col)

# Re-attach assay_chembl_id from the cleaned table. A structure can match
# several rows there, so duplicates are dropped again afterwards and only the
# first row per SMILES survives.
assay_lookup = df_clean[[col, "assay_chembl_id"]]
df_med = df_med.merge(assay_lookup, on=col, how="left")
df_med = df_med.drop_duplicates(subset=col)
df_med = df_med.reset_index(drop=True)
----------------------------------
# Option 1: Random splitting
# Imported locally so this cell runs on its own: the only other import of
# train_test_split visible in this file sits in the later selection-pool cell.
from sklearn.model_selection import train_test_split

# Step 1: Prepare the data
X = df4outer['canonical_smiles']  # SMILES column of the dataframe
y = df4outer['pChEMBL_gt6']  # Binary label column, used below for stratification

# Step 2: Perform a random split into train-test sets
train_size_ratio = 0.8  # Train fraction: 0.8 gives an 80%-20% train-test split
train_df, test_df = train_test_split(
    df4outer,
    test_size=1 - train_size_ratio,
    random_state=20,  # Seed for reproducibility
    stratify=y,  # Keep the label proportions the same in both parts
)

# Step 3: Reset indices for train and test sets
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Step 4: Verify the splits
print(f"Train set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
------------------------------------------------------------

# Option 2: Scaffold splitting
# Alternative to the random split: builds train_df / test_df from the index
# arrays returned by optunaz's ScaffoldSplit.
from optunaz.utils.preprocessing.splitter import ScaffoldSplit

# Step 1: Configure the splitter
scaffold_splitter = ScaffoldSplit(
    bins='fd',  # Binning setting (original note: the default binning algorithm)
    random_state=20,  # Seed for reproducibility
    make_scaffold_generic=True,  # Make Murcko scaffolds generic
    butina_cluster=0.4,  # Clustering threshold to aggregate scaffolds
    name='ScaffoldSplit'
)

# Step 2: Prepare your data
X = df4outer['canonical_smiles']  # SMILES column in your dataframe
y = df4outer['pChEMBL_gt6']  # Target variable (optional, for stratification)

# Step 3: Generate scaffold groups
# The result of groups() is wrapped in a Series before being passed on as
# the `groups` argument of split().
scaffold_groups = scaffold_splitter.groups(df4outer, smiles_col='canonical_smiles')
scaffold_groups_series = pd.Series(scaffold_groups)


# Step 4: Split the data into train-test indices using the scaffold groups
train_indices, test_indices = scaffold_splitter.split(X, y, groups=scaffold_groups_series)

# Step 5: Cap the split at 90% / 10% of the data
# NOTE(review): the random split in Option 1 uses train_size_ratio = 0.8,
# while this cell targets 0.9 — confirm which ratio is intended.
train_size = int(len(df4outer) * 0.9)  # Calculate 90% of the data
test_size = len(df4outer) - train_size  # Calculate remaining 10%

# Keep at most `train_size` training indices and `test_size` test indices.
# NOTE(review): slicing can only shorten each index array; it never moves
# samples between the two sets, so indices beyond the cut are simply dropped
# from the split. Confirm that this is the intended way to adjust the ratio.
train_indices = train_indices[:train_size]
test_indices = test_indices[:test_size]

# Step 6: Create train and test datasets
# Rows are picked by position (iloc), then the index is renumbered from 0.
train_df = df4outer.iloc[train_indices].reset_index(drop=True)
test_df = df4outer.iloc[test_indices].reset_index(drop=True)
-----------------------------------------------------

# Data augmentation
# Step 1: load a large pool of candidate compounds from a tab-separated file.
file_path = "C:\\Users\\jen\\Downloads\\模型\\data_5cs_smiles.txt"

# Column names applied to the pool table. With header=0 the first row of the
# file is consumed as a header and replaced by these names.
columns = [
    "Uniprot_Accession",
    "Molecule_ChEMBL_ID",
    "canonical_smiles",
    "standard_inchi_key",
    "activity",
    "potential_duplicate",
    "standard_type",
    "doc_id",
    "src_id",
    "src_description",
    "src_short_name",
]
pool_df = pd.read_csv(file_path, sep="\t", header=0, names=columns)

# Share of the pool drawn into the sub-library (0.2 = 20%).
fraction_to_sample = 0.2

# Draw the random sub-library with a fixed seed and renumber its rows.
# NOTE(review): the sample is taken from pool_df1, which is not assigned in
# this part of the file (only pool_df is) — confirm it is defined elsewhere.
sampled_pool = pool_df1.sample(frac=fraction_to_sample, random_state=42)
sub_library_df = sampled_pool.reset_index(drop=True)

# Keep a copy of the sub-library on disk.
sub_library_df.to_csv("sub_library_pool_df1.csv", index=False)

# Report how many molecules ended up in the sub-library.
print(f"Generated a sub-library with {len(sub_library_df)} molecules (20% of pool_df1).")

# Number of putative inactives still needed: 100 inactives per training
# active, minus the inactives the training split already contains.
training_labels = train_df["pChEMBL_gt6"]
num_active = int((training_labels == 1).sum())
num_inactive = int((training_labels == 0).sum())
desired_num_inactives = num_active * 100 - num_inactive
print(f"The number of active: {num_active}, The number of inactive: {num_inactive}, Desired number of inactives: {desired_num_inactives}")

# Generating fingerprint in advance
from tqdm import tqdm  # 导入 tqdm
from rdkit import Chem
from rdkit.Chem import AllChem

def get_fingerprint(smiles):
    """Return a Morgan bit-vector fingerprint for a SMILES string, or None.

    Args:
        smiles: SMILES text handed to ``Chem.MolFromSmiles``.

    Returns:
        The result of ``AllChem.GetMorganFingerprintAsBitVect`` with
        ``radius=2`` and ``nBits=2048``, or ``None`` when the SMILES cannot
        be parsed into a molecule or RDKit raises while processing it.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    except Exception:
        # Best-effort: any ordinary failure (e.g. a non-string input) yields
        # None so callers can skip the entry. Unlike a bare `except:`, this
        # lets KeyboardInterrupt / SystemExit through, so a long fingerprint
        # run can still be interrupted.
        return None

# Register tqdm with pandas so that Series.progress_apply is available below.
tqdm.pandas(desc="Initializing tqdm_pandas")

# Split the deduplicated table by its binary label. The variable names say
# "cyp_p450" but the rows come from df_med.
is_active = df_med["pChEMBL_gt6"] == 1
is_inactive = df_med["pChEMBL_gt6"] == 0
active_cyp_p450 = df_med.loc[is_active].copy()
inactive_cyp_p450 = df_med.loc[is_inactive].copy()

# Fingerprint every structure in both tables; entries for which
# get_fingerprint gives up are stored as None.
print("Generating fingerprints for active compounds...")
active_smiles = active_cyp_p450["canonical_smiles"]
active_cyp_p450["fingerprint"] = active_smiles.progress_apply(get_fingerprint)

print("Generating fingerprints for inactive compounds...")
inactive_smiles = inactive_cyp_p450["canonical_smiles"]
inactive_cyp_p450["fingerprint"] = inactive_smiles.progress_apply(get_fingerprint)


# Pick putative inactives from the sampled pool.
# Imported locally so this cell does not depend on an import made elsewhere.
from rdkit import DataStructs

# Rows of sub_library_df accepted as putative inactives.
selected_inactives = []

# NOTE(review): active_fps and inactive_fps are not assigned in this part of
# the file; presumably they are the fingerprint lists of the active and
# inactive reference compounds (with None entries removed) — confirm.

# Only scan the pool when at least one more inactive is wanted. Checking the
# quota up front (instead of only after a row has been processed) prevents a
# compound from being added when the quota is already zero or negative.
if desired_num_inactives > 0:
    for _, row in tqdm(sub_library_df.iterrows(), total=len(sub_library_df), desc="Processing pool"):
        smiles = row["canonical_smiles"]

        # Fingerprints are computed on the fly; unparsable entries are skipped.
        fingerprint = get_fingerprint(smiles)
        if fingerprint is None:
            continue

        # Reject candidates that resemble any known active (Tanimoto >= 0.4).
        # default=0.0 treats an empty reference list as "nothing similar"
        # instead of letting max() raise ValueError.
        max_similarity_active = max(
            DataStructs.BulkTanimotoSimilarity(fingerprint, active_fps),
            default=0.0,
        )
        if max_similarity_active >= 0.4:
            continue

        # Reject near-duplicates of known inactives (Tanimoto > 0.99).
        max_similarity_inactive = max(
            DataStructs.BulkTanimotoSimilarity(fingerprint, inactive_fps),
            default=0.0,
        )
        if max_similarity_inactive > 0.99:
            continue

        selected_inactives.append(row)

        # Stop as soon as the quota is filled.
        if len(selected_inactives) >= desired_num_inactives:
            break

# Collect the accepted rows and label all of them as inactive (0).
selected_inactives_df = pd.DataFrame(selected_inactives)
selected_inactives_df["pChEMBL_gt6"] = 0
import pandas as pd
from sklearn.utils import shuffle  # For shuffling the DataFrame

# Append the putative inactives to the training split, shuffle the rows with
# a fixed seed, renumber them, and keep only the three columns that are
# carried forward.
final_training_set = pd.concat(
    [train_df, selected_inactives_df],
    ignore_index=True,
)
final_training_set = shuffle(final_training_set, random_state=42)
final_training_set = final_training_set.reset_index(drop=True)[
    ["canonical_smiles", "activity", "pChEMBL_gt6"]
]
---------------------------------------------------------------------------
# Divide the augmented training data into a small training subset (10%) and
# a large selection pool (90%).
from sklearn.model_selection import train_test_split

# Labels used for stratification: the binary pChEMBL_gt6 column, so both
# parts keep the same class proportions.
stratify_labels = final_training_set['pChEMBL_gt6']
train_set, selection_pool = train_test_split(
    final_training_set,
    test_size=0.9,
    random_state=20,
    stratify=stratify_labels,
)

print(f"Training Subset size: {len(train_set)}, Selection Pool size: {len(selection_pool)}")

# Write the training subset to disk.
train_set.to_csv("./Data/MDM2_train_set_20rd.csv", index=False)

# Write the selection pool to disk.
selection_pool.to_csv("./Data/MDM2_selection_pool_20rd.csv", index=False)

# Write the held-out test data to disk.
# NOTE(review): test_set is not assigned in this part of the file; the
# earlier split cells produce test_df — confirm which one should be saved.
test_set.to_csv("./Data/MDM2_test_set_20rd.csv", index=False)