In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import os
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

In [11]:
def extract_ligand_features(mol, label):
    try:
        mol_name = mol.GetProp('_Name') if mol.HasProp('_Name') else "Unknown"
        mol_weight = Descriptors.MolWt(mol)
        logp = Descriptors.MolLogP(mol)
        h_bond_donors = rdMolDescriptors.CalcNumHBD(mol)
        h_bond_acceptors = rdMolDescriptors.CalcNumHBA(mol)
        rot_bonds = Descriptors.NumRotatableBonds(mol)
        tpsa = rdMolDescriptors.CalcTPSA(mol)

        atom_counts = {}
        for atom in mol.GetAtoms():
            symbol = atom.GetSymbol()
            atom_counts[symbol] = atom_counts.get(symbol, 0) + 1

        features = {
            "Name": mol_name,
            "MW": mol_weight,
            "LogP": logp,
            "HBD": h_bond_donors,
            "HBA": h_bond_acceptors,
            "RB": rot_bonds,
            "TPSA": tpsa,
            "Label": label
        }

        for atom in ['C', 'H', 'N', 'O', 'F', 'Cl', 'Br', 'S', 'P']:
            features[f'Num_{atom}'] = atom_counts.get(atom, 0)

        return features
    except Exception as e:
        print(f"Invalid molecule: {e}")
        return None

In [12]:
def process_sdf_file(sdf_file, label):
    file = Chem.SDMolSupplier(sdf_file)
    data = []

    for mol in file:
        if mol is None:
            continue
        result = extract_ligand_features(mol, label)
        if result:
            data.append(result)

    return pd.DataFrame(data)

In [13]:
def plot(df):
    plt.figure(figsize=(6, 4))
    plt.scatter(df['MW'], df['LogP'], c=df['Label'], cmap='bwr', alpha=0.7)
    plt.xlabel("Molecular Weight")
    plt.ylabel("LogP")
    plt.title("MW vs LogP (Red: Drug-like, Blue: Decoy)")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("mw_vs_logp.png")
    plt.show()

    plt.figure(figsize=(6, 4))
    df.boxplot(column='TPSA', by='Label', grid=False)
    plt.title("TPSA by Label (0 = Decoy, 1 = Drug-like)")
    plt.suptitle("")
    plt.xlabel("Label")
    plt.ylabel("TPSA")
    plt.tight_layout()
    plt.savefig("tpsa_by_label.png")
    plt.show()

In [None]:
def main():
    # ✅ Update these paths
    drug_like = "D:/PROGRAMING/Python Scripting For Bioinformatics/Machine_learning/data/drug_like"
    non_drug_like = "D:/PROGRAMING/Python Scripting For Bioinformatics/Machine_learning/data/non_drug_like"

    # Read and process data
    drug_df = process_sdf_file(drug_like, label=1)
    decoy_df = process_sdf_file(non_drug_like, label=0)
    final_df = pd.concat([drug_df, decoy_df], ignore_index=True)
    final_df.to_csv("ligand_features_dataset.csv", index=False)

    print(f"\n✅ Dataset created with {len(final_df)} molecules")
    print(final_df.head())

    # Features & Labels
    X = final_df.drop(["Name", "Label"], axis=1)
    y = final_df["Label"]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Model Training
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluation
    print("\n📊 Logistic Regression Results")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))

    # Plots
    plot_mw_vs_logp(final_df)
    plot_tpsa_by_label(final_df)

# -------------------- Run --------------------
if __name__ == "__main__":
    main()


OSError: File error: Bad input file D:/PROGRAMING/Python Scripting For Bioinformatics/Machine_learning/data/drug_like.sdf