In [23]:
from itertools import accumulate, chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.Scaffolds import MurckoScaffold

In [8]:
# Location of the input table. The literal below is a placeholder and must be
# replaced with a real CSV path before the notebook can run.
DATA_CSV_PATH = 'Path to data CSV file'
data = pd.read_csv(DATA_CSV_PATH)

In [22]:
def standardize_mol(mol):
    """Return the uncharged fragment parent of the cleaned-up molecule.

    Runs three `rdMolStandardize` steps in order: `Cleanup`, then
    `FragmentParent` on the cleaned molecule, then `Uncharger().uncharge`
    on the resulting parent.

    Parameters:
    - mol: RDKit molecule to standardize.

    Returns:
    - The molecule produced by the final uncharge step.
    """
    parent_fragment = rdMolStandardize.FragmentParent(
        rdMolStandardize.Cleanup(mol)
    )
    return rdMolStandardize.Uncharger().uncharge(parent_fragment)

def get_canonical_smiles(mol):
    """Serialize `mol` to SMILES with canonical ordering and isomeric
    information both explicitly switched on.

    Parameters:
    - mol: RDKit molecule to serialize.

    Returns:
    - The string produced by `Chem.MolToSmiles`.
    """
    return Chem.MolToSmiles(
        mol,
        canonical=True,
        isomericSmiles=True,
    )

def find_duplicates(smiles_df, smiles_col='smiles', id_col='cid'):
    """Split the rows of `smiles_df` into first-seen structures and repeats.

    Each SMILES is parsed, standardized with `standardize_mol` and converted
    to a canonical SMILES with `get_canonical_smiles`; rows that end up with
    the same canonical SMILES are treated as the same structure.

    Parameters:
    - smiles_df (pd.DataFrame): Table holding the molecules.
    - smiles_col (str): Column containing the SMILES strings.
    - id_col (str): Column containing the molecule identifiers.

    Returns:
    - unique_smiles_with_ids (list): `(id, canonical_smiles)` for the first
      row seen with each canonical SMILES, in row order.
    - duplicates (list): `(duplicate_id, first_seen_id)` for every later row
      whose canonical SMILES was already recorded.

    Rows whose SMILES cannot be parsed (`Chem.MolFromSmiles` returns None)
    are skipped and appear in neither list.
    """
    first_id_by_smiles = {}
    unique_smiles_with_ids = []
    duplicates = []

    for _, record in smiles_df.iterrows():
        record_id = record[id_col]
        parsed = Chem.MolFromSmiles(record[smiles_col])
        if parsed is None:
            # Unparsable SMILES: ignore the row entirely.
            continue

        key = get_canonical_smiles(standardize_mol(parsed))
        if key not in first_id_by_smiles:
            # First time this structure is seen: remember who owns it.
            unique_smiles_with_ids.append((record_id, key))
            first_id_by_smiles[key] = record_id
            continue

        # Repeat: pair this row's id with the id that first carried the structure.
        duplicates.append((record_id, first_id_by_smiles[key]))

    return unique_smiles_with_ids, duplicates

In [14]:
# Deduplicate the loaded table: first-seen structures go to `unique_smiles`,
# later repeats (paired with the id they duplicate) go to `duplicates`.
unique_smiles, duplicates = find_duplicates(
    smiles_df=data,
    smiles_col='smiles',
    id_col='cid',
)

In [None]:
# Report how many structures were kept and how many rows were repeats.
print(
    f'The number of unique smiles is {len(unique_smiles)}',
    f'The number of duplicates smiles is {len(duplicates)}',
    sep='\n',
)

In [24]:

from sklearn.metrics import f1_score,average_precision_score

def scaffold_split(data, smiles_col, test_size=0.2, random_state=42):
    """
    Split a molecule dataset into training and test sets based on scaffolds.

    Molecules are grouped by their Murcko scaffold SMILES (chirality ignored).
    The groups are shuffled, then assigned greedily: a group goes to the test
    set if it still fits within the test budget, otherwise to the training
    set. A scaffold therefore never appears in both sets, and the test set
    may end up smaller than the requested size.

    Parameters:
    - data (pd.DataFrame): The dataset containing molecule data. Any index is
      accepted; rows are tracked by position, not by index label.
    - smiles_col (str): The name of the column containing SMILES strings.
    - test_size (float): Proportion of the dataset to include in the test split.
    - random_state (int): Random state for reproducibility.

    Returns:
    - data_train (pd.DataFrame): Training set.
    - data_test (pd.DataFrame): Test set.

    Raises:
    - ValueError: If a SMILES string cannot be parsed into a molecule.
    """
    # Group row POSITIONS (not index labels) by scaffold. The final selection
    # uses .iloc, so positions are the only thing that is correct for frames
    # whose index is not the default 0..n-1 (e.g. after filtering).
    scaffolds = {}
    for position, smiles in enumerate(data[smiles_col]):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(
                f"Could not parse SMILES {smiles!r} at row position {position}"
            )
        scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=False)
        scaffolds.setdefault(scaffold, []).append(position)

    # Shuffle the scaffold groups with a private generator so the caller's
    # global NumPy random state is left untouched. A RandomState seeded this
    # way yields the same permutation as np.random.seed + np.random.shuffle.
    scaffold_lists = list(scaffolds.values())
    rng = np.random.RandomState(random_state)
    rng.shuffle(scaffold_lists)

    num_molecules = len(data)
    num_test = int(np.floor(test_size * num_molecules))

    # Greedy fill: whole groups only, never exceeding the test budget.
    train_idx, test_idx = [], []
    for scaffold_list in scaffold_lists:
        if len(test_idx) + len(scaffold_list) <= num_test:
            test_idx.extend(scaffold_list)
        else:
            train_idx.extend(scaffold_list)

    data_train = data.iloc[train_idx]
    data_test = data.iloc[test_idx]

    return data_train, data_test

In [None]:
# Scaffold-based train/test split on the 'smiles' column, using
# scaffold_split's default test_size and a fixed seed of 40.
# NOTE(review): `combine_df` is not assigned in any cell visible in this
# notebook — confirm where it comes from (a removed cell, or possibly the
# deduplicated version of `data`); as written this cell raises NameError
# on a fresh kernel.
data_train, data_test = scaffold_split(combine_df, 'smiles', random_state=40)

In [None]:
# Write each split to its own CSV file, without the index column.
# The file names below are placeholders to be replaced with real paths.
for split_frame, csv_path in (
    (data_train, 'Path to csv data_train.csv'),
    (data_test, 'Path to csv data_test.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [None]:
# --- Fingerprints ------------------------------------------------------------
# Parse the SMILES of both splits and compute Morgan bit-vector fingerprints
# (radius 2, 2048 bits) for every molecule.
mol_train = [Chem.MolFromSmiles(x) for x in data_train['smiles']]
mol_test = [Chem.MolFromSmiles(x) for x in data_test['smiles']]

fp_train = [AllChem.GetMorganFingerprintAsBitVect(x, radius=2, nBits=2048) for x in mol_train]
fp_test = [AllChem.GetMorganFingerprintAsBitVect(x, radius=2, nBits=2048) for x in mol_test]

size_x = len(fp_train)
size_y = len(fp_test)

print(size_x)
print(size_y)

# --- Test-vs-train Tanimoto similarity ----------------------------------------
# One row per test molecule, one column per training molecule.
similarity_matrix = np.zeros((size_y, size_x))

# NumPy copies of the test fingerprints. They are not read again in the cells
# shown here; kept in case later cells rely on `np_fps`.
np_fps = []
for idx, fp in enumerate(fp_test):
    np_fp = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, np_fp)
    np_fps.append(np_fp)
    # Similarity of this test fingerprint against every training fingerprint
    similarity = DataStructs.BulkTanimotoSimilarity(fp, fp_train)
    similarity_matrix[idx] = similarity

# Label rows/columns with molecule identifiers.
# NOTE(review): every other cell identifies molecules by the 'cid' column;
# confirm that data_train/data_test really carry a 'mol_id' column.
df_similarity = pd.DataFrame(similarity_matrix)
df_similarity.columns = list(data_train['mol_id'])
df_similarity.index = data_test['mol_id']

# --- Heatmap ------------------------------------------------------------------
sns.set(rc={'figure.figsize': (11.7, 8.27)})
fig, ax = plt.subplots(dpi=300, figsize=(7, 5))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(df_similarity, vmin=0, vmax=1,
            yticklabels=False, xticklabels=False, cmap="coolwarm", ax=ax)
ax.set_xlabel("Train", fontsize=15)
ax.set_ylabel("Test", fontsize=15)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# The similarity matrix is rectangular (one row per test molecule, one column
# per training molecule), so every entry is a distinct test/train pair: there
# is no mirrored half and no self-similarity diagonal to discard. Use all
# entries; taking an upper triangle here would silently drop valid pairs.
tanimoto_coefficients = np.asarray(similarity_matrix).ravel()

# Thresholds from 0 to 1 in steps of 0.1
ranges = np.arange(0, 1.1, 0.1)

# For each threshold, the share of pairs whose similarity is strictly below it
accumulated_proportion = [
    np.sum(tanimoto_coefficients < threshold) / len(tanimoto_coefficients)
    for threshold in ranges
]

# Create a single figure
fig, ax_tanimoto = plt.subplots(figsize=(8, 6))

# Plotting the data
ax_tanimoto.plot(ranges, accumulated_proportion, 'o-', color='blue',
                 linewidth=2, markersize=8, label='Accumulated Proportion')

# Axis labels and title
ax_tanimoto.set_xlabel('Tanimoto Coefficients Range', fontsize=16, fontweight='bold')
ax_tanimoto.set_ylabel('Accumulated Proportion', fontsize=16, fontweight='bold')
ax_tanimoto.set_title('Tanimoto Coefficients Between Training and Test Active Scaffold',
                      fontsize=18, fontweight='bold')

# One tick per threshold.
# NOTE(review): the curve is cumulative (share of pairs below each threshold)
# while the tick labels are written as bins — confirm this labelling is intended.
x_labels = [f'[{i:.1f}, {i + 0.1:.1f})' for i in np.arange(0, 1, 0.1)] + ['[1.0, 1.0]']
ax_tanimoto.set_xticks(ranges)
ax_tanimoto.set_xticklabels(x_labels, rotation=45, ha='right', fontsize=14, color='black')

# Customize y-ticks
ax_tanimoto.tick_params(axis='y', labelsize=14, colors='black')

# Add grid lines for better readability
ax_tanimoto.grid(True, linestyle='--', alpha=0.7)

# Add black borders to all four sides
for spine in ax_tanimoto.spines.values():
    spine.set_edgecolor('black')
    spine.set_linewidth(2)

# Inward-pointing major ticks on both axes
ax_tanimoto.tick_params(axis='both', which='major', direction='in', length=8, width=2, colors='black')

# Add a legend
ax_tanimoto.legend(fontsize=14, loc='lower right')

# Show the plot
plt.tight_layout()
plt.show()


In [None]:
# Map each Murcko scaffold SMILES (chirality ignored) found in the training
# split to the list of 'cid' values of the molecules that share it.
scaffolds = {}
for compound_id, smiles_string in zip(data_train['cid'], data_train['smiles']):
    scaffold_smiles = MurckoScaffold.MurckoScaffoldSmiles(
        mol=Chem.MolFromSmiles(smiles_string),
        includeChirality=False,
    )
    scaffolds.setdefault(scaffold_smiles, []).append(compound_id)

# One row per scaffold; the 'cid' column holds the list of member ids.
dfscaffolds = pd.DataFrame(list(scaffolds.items()), columns=['scaffold', 'cid'])
# NOTE(review): absolute, user-specific output path.
dfscaffolds.to_csv('/home/juni/working/mettl3/train_scaffolds.csv', index=False)
dfscaffolds

In [None]:
# Map each Murcko scaffold SMILES (chirality ignored) found in the test split
# to the list of 'cid' values of the molecules that share it.
scaffolds = {}
for compound_id, smiles_string in zip(data_test['cid'], data_test['smiles']):
    scaffold_smiles = MurckoScaffold.MurckoScaffoldSmiles(
        mol=Chem.MolFromSmiles(smiles_string),
        includeChirality=False,
    )
    scaffolds.setdefault(scaffold_smiles, []).append(compound_id)

# One row per scaffold; the 'cid' column holds the list of member ids.
dfscaffolds = pd.DataFrame(list(scaffolds.items()), columns=['scaffold', 'cid'])
# NOTE(review): absolute, user-specific output path.
dfscaffolds.to_csv('/home/juni/working/mettl3/test_scaffolds.csv', index=False)
dfscaffolds