### Scaffolds_Analysis


#### 1. SMILES to SDF files

In [None]:
import os
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolTransforms
from rdkit.Geometry import Point3D
from tqdm.notebook import tqdm
from pathlib import Path

def align_mol_to_principal_axes(mol):
    """Centre a molecule's conformer and rotate it onto its principal axes.

    The conformer is modified in place: its centroid is moved to the origin
    and the axes of largest, middle and smallest coordinate spread are mapped
    onto x, y and z respectively. If the aligned structure is essentially
    flat (z extent below 1e-3), every atom receives a small random z offset
    so that the structure is not perfectly planar.

    Args:
        mol: RDKit molecule carrying at least one conformer. Conformers that
            are not flagged as 3D are left untouched.

    Returns:
        None. Alignment is best-effort: any error is reported with a printed
        warning and the conformer is left in whatever state it had reached.
    """
    try:
        conf = mol.GetConformer()
        if not conf.Is3D():
            return

        coords = conf.GetPositions()
        centroid = coords.mean(axis=0)
        centered = coords - centroid

        # Eigenvectors of the (unnormalised) covariance matrix are the
        # principal axes; eigh returns them in ascending eigenvalue order.
        covariance_matrix = np.dot(centered.T, centered)
        evals, evecs = np.linalg.eigh(covariance_matrix)

        # Largest spread first, so it ends up along x.
        sort_indices = np.argsort(evals)[::-1]
        rotation_matrix = evecs[:, sort_indices].T

        # eigh only guarantees an orthonormal basis, not a right-handed one.
        # A determinant of -1 would make this a reflection, which mirrors the
        # molecule and inverts its stereocentres. Flipping one axis restores
        # a proper rotation without changing the alignment.
        if np.linalg.det(rotation_matrix) < 0:
            rotation_matrix[2, :] *= -1.0

        # Homogeneous transform: translate the centroid to the origin first,
        # then rotate.
        T = np.identity(4)
        T[:3, 3] = -centroid

        R = np.identity(4)
        R[:3, :3] = rotation_matrix

        M = np.dot(R, T)
        rdMolTransforms.TransformConformer(conf, M)

        new_coords = conf.GetPositions()
        z_values = new_coords[:, 2]

        # Perturb flat structures slightly so z is not constant.
        if (np.max(z_values) - np.min(z_values)) < 1e-3:
            for i in range(mol.GetNumAtoms()):
                pos = conf.GetAtomPosition(i)
                jitter = np.random.uniform(-0.001, 0.001)
                conf.SetAtomPosition(i, Point3D(pos.x, pos.y, pos.z + jitter))

    except Exception as e:
        # Best-effort step: report the problem instead of hiding it, but never
        # abort the caller's batch because one molecule could not be aligned.
        print(f"Warning: principal-axis alignment skipped ({e})")

def create_mol_objects_from_file(input_file_path: str, output_folder_path: str,
                                 output_filename: str = "Test_3d.sdf"):
    """Convert a table of SMILES strings into an SDF file of 3D conformers.

    The input table (.csv, .xlsx or .parquet) must contain a column named
    "smiles" (case-insensitive). Each SMILES is parsed, hydrogens are added,
    a 3D conformer is embedded with a fixed random seed, optimised with MMFF,
    aligned to its principal axes and written to the SDF with the SMILES as
    the molecule name. Progress and a success/failure summary are printed.

    Args:
        input_file_path: Path of the table holding the SMILES column.
        output_folder_path: Folder for the SDF; created if it does not exist.
        output_filename: Name of the SDF file written inside the output
            folder. Defaults to "Test_3d.sdf".

    Returns:
        None. The function returns early (after printing a message) when the
        input file is missing, its format is unsupported, it cannot be read,
        or it has no SMILES column.
    """
    input_path = Path(input_file_path)
    output_folder = Path(output_folder_path)

    if not input_path.is_file():
        print(f"Error: Input file does not exist -> {input_path}")
        return

    output_folder.mkdir(parents=True, exist_ok=True)
    output_sdf_path = output_folder / output_filename

    print(f"Input File: {input_path}")
    print(f"Output SDF: {output_sdf_path}")

    print("\nReading input file...")
    readers = {
        '.csv': pd.read_csv,
        '.xlsx': pd.read_excel,
        '.parquet': pd.read_parquet,
    }
    try:
        reader = readers.get(input_path.suffix.lower())
        if reader is None:
            print("Unsupported format")
            return
        df = reader(input_path)

        # str() keeps the lookup working when column labels are not strings.
        col_match = [c for c in df.columns if str(c).lower() == 'smiles']
        if not col_match:
            print("SMILES column not found")
            return
        smiles_list = df[col_match[0]].tolist()

        # Only the SMILES list is needed from here on.
        del df
    except Exception as e:
        print(f"Read failed: {e}")
        return

    print("\nStreaming processing and repairing planar molecules...")
    writer = Chem.SDWriter(str(output_sdf_path))

    success_count = 0
    fail_count = 0

    try:
        for smiles in tqdm(smiles_list, desc="Processing"):
            # Empty cells arrive as NaN; count them as failures directly
            # rather than trying to parse the string "nan".
            if pd.isna(smiles):
                fail_count += 1
                continue

            smiles_str = str(smiles)
            try:
                mol = Chem.MolFromSmiles(smiles_str)
                if mol is None:
                    fail_count += 1
                    continue

                mol = Chem.AddHs(mol)
                # EmbedMolecule returns 0 when a conformer was generated.
                if AllChem.EmbedMolecule(mol, randomSeed=42, useRandomCoords=True) != 0:
                    fail_count += 1
                    continue

                AllChem.MMFFOptimizeMolecule(mol)
                align_mol_to_principal_axes(mol)

                mol.SetProp("_Name", smiles_str)
                writer.write(mol)
                success_count += 1
            except Exception:
                # One bad molecule must not stop the batch. KeyboardInterrupt
                # is deliberately not caught so the run can still be aborted.
                fail_count += 1
    finally:
        # Always flush and close the SDF, even if the loop is interrupted.
        writer.close()

    print("\nProcessing complete!")
    print(f"Success: {success_count}")
    print(f"Failed: {fail_count}")
    print(f"File saved to: {output_sdf_path}")

# Step 1 driver: build the 3D SDF from the standardized compound library.
input_file = r"C:\Users\Cenking\Documents\SwissTools\Autodock\L1900-Targetmol-Anti-diabetic Compound Library-690cpds_standardized.csv"
output_folder = r"C:\Users\Cenking\Documents\SwissTools\Autodock"
create_mol_objects_from_file(
    input_file_path=input_file,
    output_folder_path=output_folder,
)

#### 2. Extract Scaffolds

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from tqdm.notebook import tqdm
from pathlib import Path

def extract_murcko_scaffolds_from_sdf(input_sdf_path: str, output_folder_path: str):
    """Write the Murcko scaffold SMILES of every molecule in an SDF to Excel.

    Each readable molecule in the SDF is reduced to its Murcko scaffold and
    the scaffold SMILES is collected. The collected SMILES are saved, one per
    row, in the column "Scaffold_SMILES" of "murcko_scaffolds.xlsx" inside
    the output folder. Progress and a summary are printed.

    Args:
        input_sdf_path: Path of the SDF file to read.
        output_folder_path: Folder for the Excel file; created if missing.

    Returns:
        None. Returns early (after printing a message) if the input file does
        not exist or the SDF cannot be opened. No Excel file is written when
        no scaffold was extracted.
    """
    sdf_file = Path(input_sdf_path)
    target_dir = Path(output_folder_path)

    if not sdf_file.is_file():
        print(f"Error: Input file does not exist -> {sdf_file}")
        return

    target_dir.mkdir(parents=True, exist_ok=True)
    excel_target = target_dir / "murcko_scaffolds.xlsx"

    print(f"Input File: {sdf_file}")
    print(f"Output Folder: {target_dir}")
    print(f"Excel file to be created: {excel_target}")

    try:
        mol_reader = Chem.SDMolSupplier(str(sdf_file))
        print("Successfully created SDF file supplier.")
    except Exception as open_error:
        print(f"Error opening SDF file: {open_error}")
        return

    print("\nExtracting Murcko scaffolds from molecule objects...")

    scaffolds = []
    n_failed = 0
    n_seen = 0  # stays 0 if the supplier yields nothing

    for n_seen, entry in enumerate(tqdm(mol_reader, desc="Extracting Scaffolds"), start=1):
        # The supplier yields None for records it could not turn into a molecule.
        if entry is None:
            n_failed += 1
            continue

        try:
            core = MurckoScaffold.GetScaffoldForMol(entry)
            scaffolds.append(Chem.MolToSmiles(core))
        except Exception:
            n_failed += 1

    print("\nScaffold extraction complete!")
    print(f"Total molecules processed: {n_seen}")
    print(f"Successfully extracted scaffolds: {len(scaffolds)}")
    print(f"Failed or invalid molecules: {n_failed}")

    if not scaffolds:
        print("No valid scaffolds extracted, Excel file not created.")
        return

    print("\nSaving scaffold SMILES to Excel file...")
    scaffold_table = pd.DataFrame({"Scaffold_SMILES": scaffolds})
    scaffold_table.to_excel(excel_target, index=False, engine='openpyxl')
    print(f"All scaffolds successfully saved to: {excel_target}")

# Step 2 driver: extract Murcko scaffolds from the 3D SDF.
# NOTE(review): step 1 above writes "Test_3d.sdf" into the Autodock folder,
# while this cell reads "molecules_3d.sdf" from ScaffoldsAnalysis — confirm
# this is the intended input file.
input_file = r"C:\Users\Cenking\Documents\SwissTools\ScaffoldsAnalysis\molecules_3d.sdf"
output_folder = r"C:\Users\Cenking\Documents\SwissTools"
extract_murcko_scaffolds_from_sdf(input_file, output_folder)

#### 3. Scaffold Frequency Analysis

In [None]:
import pandas as pd
from pathlib import Path

def analyze_and_count_scaffolds(input_xlsx_path: str, output_folder_path: str):
    """Count how often each scaffold SMILES occurs and save the table to Excel.

    Reads the "Scaffold_SMILES" column of the input workbook, counts the
    occurrences of every distinct value (most frequent first) and writes the
    result, with columns "Scaffold_SMILES" and "Count", to
    "scaffold_statistics.xlsx" inside the output folder.

    Rows whose scaffold is missing are counted as their own group instead of
    being dropped, so the counts always add up to the number of input rows.
    (Presumably these are molecules without a ring system whose empty
    scaffold SMILES is read back from Excel as a missing value.)

    Args:
        input_xlsx_path: Path of the Excel file with a "Scaffold_SMILES" column.
        output_folder_path: Folder for the statistics file; created if missing.

    Returns:
        None. Returns early (after printing a message) if the input file does
        not exist, cannot be read, or lacks the "Scaffold_SMILES" column.
    """
    input_path = Path(input_xlsx_path)
    output_folder = Path(output_folder_path)

    if not input_path.is_file():
        print(f"Error: Input file does not exist -> {input_path}")
        return

    output_folder.mkdir(parents=True, exist_ok=True)

    output_xlsx_path = output_folder / "scaffold_statistics.xlsx"

    print(f"Input File: {input_path}")
    print(f"Output Folder: {output_folder}")
    print(f"Statistics file to be created: {output_xlsx_path}")

    print("\nReading Excel file... (this may take some time depending on file size)")
    try:
        df = pd.read_excel(input_path)
        if "Scaffold_SMILES" not in df.columns:
            print("Error: Column 'Scaffold_SMILES' not found in input file.")
            return
        print(f"Successfully read {len(df)} scaffold data.")
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return

    print("\nFile read complete, performing efficient scaffold statistics...")

    # dropna=False keeps rows with a missing scaffold in the statistics;
    # the default would silently discard them.
    scaffold_counts = df['Scaffold_SMILES'].value_counts(dropna=False)

    # Naming the index and the value column explicitly gives the same two
    # column labels regardless of how the pandas version names the result.
    df_results = scaffold_counts.rename_axis('Scaffold_SMILES').reset_index(name='Count')

    print("Statistics complete!")
    print(f"Found {len(df_results)} unique scaffolds.")

    if not df_results.empty:
        print("\nSaving statistics results to new Excel file...")

        df_results.to_excel(output_xlsx_path, index=False, engine='openpyxl')

        print(f"Statistical analysis results successfully saved to: {output_xlsx_path}")
    else:
        print("No data to analyze, no output file created.")

# Step 3 driver: tabulate scaffold frequencies from the step 2 workbook.
input_file = r"C:\Users\Cenking\Documents\SwissTools\murcko_scaffolds.xlsx"
output_folder = r"C:\Users\Cenking\Documents\SwissTools"
analyze_and_count_scaffolds(input_file, output_folder)