# Checking datasets for toxicophores and alerting structural motifs


In [None]:
import csv
import pandas as pd
from rdkit import Chem

# === File paths ===
# Input dataset and destination for the per-compound match table.
smiles_csv_path = r'C:\PhD\Project\DT40_Cyto.csv'
output_csv_path = r'C:\PhD\Prompt_Engineering\Outputs2\DS2_Tox_Motif_Expanded_Individual.csv'

# === SMARTS Definitions ===
# The dictionary keys end up as column names in the output CSV, so they are
# kept stable even where a NOTE(review) flags a label/pattern mismatch.

# Toxicophores
toxicophore_smarts = {
    'Amide': '[NX3][CX3](=O)[#6]',
    'Sulfonamide': '[#6][#6]S(=O)(=O)N',
    'Carboxylic_Acid': '[CX3](=O)[OX2H1]',
    # NOTE(review): this query is an N-N=O arrangement (nitroso on nitrogen),
    # not a carbon-bound NO2 group -- confirm the intended chemistry.
    'Nitro_Aliphatic': '[#7]N=O',
    'Nitro_Aromatic': 'c1ccc([N+](=O)[O-])cc1',
    # NOTE(review): does not match the N=[N+]=[N-] form used by 'Azide_Group'
    # in the alert table below -- confirm the intended chemistry.
    'Azide': '[NX3][NX2]=[NX2]',
    'Acyl_Chloride': '[CX3](=O)Cl',
    'Anhydride': '[CX3](=O)OC(=O)[#6]',
    'Hydroxamic_Acid': '[CX3](=O)N[OH]',
    # NOTE(review): matches two directly bonded acyclic carbonyl carbons
    # (a 1,2-dicarbonyl), not a 1,3-diketone -- confirm the intended chemistry.
    'Beta_Diketone': '[C;!R](=O)[C;!R](=O)',
    # NOTE(review): this is a nitrate-type [O-][N+](=O)O arrangement --
    # confirm it is the intended query for this label.
    'Peroxynitrite': '[O-][N+](=O)O',
    'General_Nitro': '[$([N+](=O)[O-])]'
}

# Alerting Structural Motifs
alert_smarts = {
    'Aryl_Sulfonamide': '[#6][#6]S(=O)(=O)Nc1ccccc1',
    'Arylamide': 'CCc1ccc(cc1)C(=O)Nc2ccccc2',
    'Acetanilide': '[#6][#6]C(=O)Nc1ccccc1',
    'Primary_Secondary_Amine': '[NX3;H2,H1;!$(NC=O)]',
    # Fixed: the previous query '[#6]=[#6]' was a carbon-carbon DOUBLE bond,
    # so it flagged alkenes and never alkynes. '#' is the triple-bond symbol.
    'Alkyne': '[#6]#[#6]',
    'Benzoyl_Chloride': 'c1ccccc1C(=O)Cl',
    'Isothiocyanate': 'N=C=S',
    # Fixed: the previous query '[C;H1,H2]#[N]' required hydrogens on the
    # nitrile carbon, which an R-C#N carbon does not carry, so organic
    # nitriles were never matched.
    'Nitrile': '[CX2]#[NX1]',
    'Azide_Group': 'N=[N+]=[N-]',
    'Isocyanate': 'N=C=O',
    # Fixed: RDKit sanitization stores nitro groups in the charge-separated
    # form, so the previous pentavalent query 'c1ccccc1N(=O)=O' could not
    # match a parsed molecule.
    'Nitrobenzene': 'c1ccccc1[N+](=O)[O-]',
    'Alkene': 'C=C',
    'Phenol': '[O;H1]-c1ccccc1',
    'Enol': '[C;H2,H1]=[C;H2,H1]-[OH]',
    'Coumarin': 'c1ccc2c(c1)ccc(=O)o2'
}

# Compile SMARTS
# Compiled once up front so the same query objects are reused for every molecule.
compiled_tox = {name: Chem.MolFromSmarts(s) for name, s in toxicophore_smarts.items()}
compiled_alerts = {name: Chem.MolFromSmarts(s) for name, s in alert_smarts.items()}

# === Feature extraction ===
def extract_smarts_matches(smiles):
    """Flag which toxicophore and alert patterns occur in one SMILES string.

    Returns a dict with one 0/1 entry per pattern name from ``compiled_tox``
    and ``compiled_alerts``, plus the per-group totals
    ``Total_Toxicophores_Present`` and ``Total_Alerting_Motifs_Present``.
    When RDKit cannot parse the SMILES, every entry (totals included) is the
    string ``'INVALID_SMILES'`` instead.
    """
    parsed = Chem.MolFromSmiles(smiles)

    if parsed is None:
        # Unparsable input: fill every column with the same marker value.
        marker = 'INVALID_SMILES'
        flags = dict.fromkeys([*compiled_tox, *compiled_alerts], marker)
        flags['Total_Toxicophores_Present'] = marker
        flags['Total_Alerting_Motifs_Present'] = marker
        return flags

    flags = {}
    # Each group contributes its per-pattern flags followed by its own total,
    # which keeps the column order: toxicophores, total, alerts, total.
    groups = (
        ('Total_Toxicophores_Present', compiled_tox),
        ('Total_Alerting_Motifs_Present', compiled_alerts),
    )
    for total_column, patterns in groups:
        hits = {
            label: int(parsed.HasSubstructMatch(query))
            for label, query in patterns.items()
        }
        flags.update(hits)
        flags[total_column] = sum(hits.values())

    return flags

# === Load SMILES ===
# Missing values are dropped and duplicates collapsed, so each distinct
# SMILES string is analysed exactly once.
df = pd.read_csv(smiles_csv_path)
smiles_list = df['PUBCHEM_EXT_DATASOURCE_SMILES'].dropna().unique()

# === Process and collect results ===
# One row per unique SMILES: the input string followed by its match columns.
results = [
    {'SMILES': smiles, **extract_smarts_matches(smiles)}
    for smiles in smiles_list
]

# === Save to CSV ===
output_df = pd.DataFrame(results)
output_df.to_csv(output_csv_path, index=False)

# Status messages: the emoji were stored as mis-decoded bytes and are restored
# here; the output path is reported as in the other dataset cells.
print("✅ SMARTS analysis complete.")
print(f"📁 Results saved to: {output_csv_path}")

In [None]:
import csv
import pandas as pd
from rdkit import Chem

# === File paths ===
# Input dataset and destination for the per-compound match table.
smiles_csv_path = r'C:\PhD\Project\NIH3T3.csv'
output_csv_path = r'C:\PhD\Prompt_Engineering\Outputs2\DS3_Tox_Motif_Expanded_Individual.csv'

# === SMARTS Definitions ===
# The dictionary keys end up as column names in the output CSV, so they are
# kept stable even where a NOTE(review) flags a label/pattern mismatch.

# Toxicophores
toxicophore_smarts = {
    'Amide': '[NX3][CX3](=O)[#6]',
    'Sulfonamide': '[#6][#6]S(=O)(=O)N',
    'Carboxylic_Acid': '[CX3](=O)[OX2H1]',
    # NOTE(review): this query is an N-N=O arrangement (nitroso on nitrogen),
    # not a carbon-bound NO2 group -- confirm the intended chemistry.
    'Nitro_Aliphatic': '[#7]N=O',
    'Nitro_Aromatic': 'c1ccc([N+](=O)[O-])cc1',
    # NOTE(review): does not match the N=[N+]=[N-] form used by 'Azide_Group'
    # in the alert table below -- confirm the intended chemistry.
    'Azide': '[NX3][NX2]=[NX2]',
    'Acyl_Chloride': '[CX3](=O)Cl',
    'Anhydride': '[CX3](=O)OC(=O)[#6]',
    'Hydroxamic_Acid': '[CX3](=O)N[OH]',
    # NOTE(review): matches two directly bonded acyclic carbonyl carbons
    # (a 1,2-dicarbonyl), not a 1,3-diketone -- confirm the intended chemistry.
    'Beta_Diketone': '[C;!R](=O)[C;!R](=O)',
    # NOTE(review): this is a nitrate-type [O-][N+](=O)O arrangement --
    # confirm it is the intended query for this label.
    'Peroxynitrite': '[O-][N+](=O)O',
    'General_Nitro': '[$([N+](=O)[O-])]'
}

# Alerting Structural Motifs
alert_smarts = {
    'Aryl_Sulfonamide': '[#6][#6]S(=O)(=O)Nc1ccccc1',
    'Arylamide': 'CCc1ccc(cc1)C(=O)Nc2ccccc2',
    'Acetanilide': '[#6][#6]C(=O)Nc1ccccc1',
    'Primary_Secondary_Amine': '[NX3;H2,H1;!$(NC=O)]',
    # Fixed: the previous query '[#6]=[#6]' was a carbon-carbon DOUBLE bond,
    # so it flagged alkenes and never alkynes. '#' is the triple-bond symbol.
    'Alkyne': '[#6]#[#6]',
    'Benzoyl_Chloride': 'c1ccccc1C(=O)Cl',
    'Isothiocyanate': 'N=C=S',
    # Fixed: the previous query '[C;H1,H2]#[N]' required hydrogens on the
    # nitrile carbon, which an R-C#N carbon does not carry, so organic
    # nitriles were never matched.
    'Nitrile': '[CX2]#[NX1]',
    'Azide_Group': 'N=[N+]=[N-]',
    'Isocyanate': 'N=C=O',
    # Fixed: RDKit sanitization stores nitro groups in the charge-separated
    # form, so the previous pentavalent query 'c1ccccc1N(=O)=O' could not
    # match a parsed molecule.
    'Nitrobenzene': 'c1ccccc1[N+](=O)[O-]',
    'Alkene': 'C=C',
    'Phenol': '[O;H1]-c1ccccc1',
    'Enol': '[C;H2,H1]=[C;H2,H1]-[OH]',
    'Coumarin': 'c1ccc2c(c1)ccc(=O)o2'
}

# Compile SMARTS
# Compiled once up front so the same query objects are reused for every molecule.
compiled_tox = {name: Chem.MolFromSmarts(s) for name, s in toxicophore_smarts.items()}
compiled_alerts = {name: Chem.MolFromSmarts(s) for name, s in alert_smarts.items()}

# === Feature extraction ===
def extract_smarts_matches(smiles):
    """Flag which toxicophore and alert patterns occur in one SMILES string.

    Returns a dict with one 0/1 entry per pattern name from ``compiled_tox``
    and ``compiled_alerts``, plus the per-group totals
    ``Total_Toxicophores_Present`` and ``Total_Alerting_Motifs_Present``.
    When RDKit cannot parse the SMILES, every entry (totals included) is the
    string ``'INVALID_SMILES'`` instead.
    """
    parsed = Chem.MolFromSmiles(smiles)

    if parsed is None:
        # Unparsable input: fill every column with the same marker value.
        marker = 'INVALID_SMILES'
        flags = dict.fromkeys([*compiled_tox, *compiled_alerts], marker)
        flags['Total_Toxicophores_Present'] = marker
        flags['Total_Alerting_Motifs_Present'] = marker
        return flags

    flags = {}
    # Each group contributes its per-pattern flags followed by its own total,
    # which keeps the column order: toxicophores, total, alerts, total.
    groups = (
        ('Total_Toxicophores_Present', compiled_tox),
        ('Total_Alerting_Motifs_Present', compiled_alerts),
    )
    for total_column, patterns in groups:
        hits = {
            label: int(parsed.HasSubstructMatch(query))
            for label, query in patterns.items()
        }
        flags.update(hits)
        flags[total_column] = sum(hits.values())

    return flags

# === Load SMILES ===
# Missing values are dropped and duplicates collapsed, so each distinct
# SMILES string is analysed exactly once.
df = pd.read_csv(smiles_csv_path)
smiles_list = df['PUBCHEM_EXT_DATASOURCE_SMILES'].dropna().unique()

# === Process and collect results ===
# One row per unique SMILES: the input string followed by its match columns.
results = [
    {'SMILES': smiles, **extract_smarts_matches(smiles)}
    for smiles in smiles_list
]

# === Save to CSV ===
output_df = pd.DataFrame(results)
output_df.to_csv(output_csv_path, index=False)

# Status messages: the emoji were stored as mis-decoded bytes and are restored here.
print("✅ SMARTS analysis complete.")
print(f"📁 Results saved to: {output_csv_path}")

In [None]:
import csv
import pandas as pd
from rdkit import Chem

# === File paths ===
# Input dataset and destination for the per-compound match table.
smiles_csv_path = r'C:\PhD\Project\PhoP.csv'
output_csv_path = r'C:\PhD\Prompt_Engineering\Outputs2\DS4_Tox_Motif_Expanded_Individual.csv'

# === SMARTS Definitions ===
# The dictionary keys end up as column names in the output CSV, so they are
# kept stable even where a NOTE(review) flags a label/pattern mismatch.

# Toxicophores
toxicophore_smarts = {
    'Amide': '[NX3][CX3](=O)[#6]',
    'Sulfonamide': '[#6][#6]S(=O)(=O)N',
    'Carboxylic_Acid': '[CX3](=O)[OX2H1]',
    # NOTE(review): this query is an N-N=O arrangement (nitroso on nitrogen),
    # not a carbon-bound NO2 group -- confirm the intended chemistry.
    'Nitro_Aliphatic': '[#7]N=O',
    'Nitro_Aromatic': 'c1ccc([N+](=O)[O-])cc1',
    # NOTE(review): does not match the N=[N+]=[N-] form used by 'Azide_Group'
    # in the alert table below -- confirm the intended chemistry.
    'Azide': '[NX3][NX2]=[NX2]',
    'Acyl_Chloride': '[CX3](=O)Cl',
    'Anhydride': '[CX3](=O)OC(=O)[#6]',
    'Hydroxamic_Acid': '[CX3](=O)N[OH]',
    # NOTE(review): matches two directly bonded acyclic carbonyl carbons
    # (a 1,2-dicarbonyl), not a 1,3-diketone -- confirm the intended chemistry.
    'Beta_Diketone': '[C;!R](=O)[C;!R](=O)',
    # NOTE(review): this is a nitrate-type [O-][N+](=O)O arrangement --
    # confirm it is the intended query for this label.
    'Peroxynitrite': '[O-][N+](=O)O',
    'General_Nitro': '[$([N+](=O)[O-])]'
}

# Alerting Structural Motifs
alert_smarts = {
    'Aryl_Sulfonamide': '[#6][#6]S(=O)(=O)Nc1ccccc1',
    'Arylamide': 'CCc1ccc(cc1)C(=O)Nc2ccccc2',
    'Acetanilide': '[#6][#6]C(=O)Nc1ccccc1',
    'Primary_Secondary_Amine': '[NX3;H2,H1;!$(NC=O)]',
    # Fixed: the previous query '[#6]=[#6]' was a carbon-carbon DOUBLE bond,
    # so it flagged alkenes and never alkynes. '#' is the triple-bond symbol.
    'Alkyne': '[#6]#[#6]',
    'Benzoyl_Chloride': 'c1ccccc1C(=O)Cl',
    'Isothiocyanate': 'N=C=S',
    # Fixed: the previous query '[C;H1,H2]#[N]' required hydrogens on the
    # nitrile carbon, which an R-C#N carbon does not carry, so organic
    # nitriles were never matched.
    'Nitrile': '[CX2]#[NX1]',
    'Azide_Group': 'N=[N+]=[N-]',
    'Isocyanate': 'N=C=O',
    # Fixed: RDKit sanitization stores nitro groups in the charge-separated
    # form, so the previous pentavalent query 'c1ccccc1N(=O)=O' could not
    # match a parsed molecule.
    'Nitrobenzene': 'c1ccccc1[N+](=O)[O-]',
    'Alkene': 'C=C',
    'Phenol': '[O;H1]-c1ccccc1',
    'Enol': '[C;H2,H1]=[C;H2,H1]-[OH]',
    'Coumarin': 'c1ccc2c(c1)ccc(=O)o2'
}

# Compile SMARTS
# Compiled once up front so the same query objects are reused for every molecule.
compiled_tox = {name: Chem.MolFromSmarts(s) for name, s in toxicophore_smarts.items()}
compiled_alerts = {name: Chem.MolFromSmarts(s) for name, s in alert_smarts.items()}

# === Feature extraction ===
def extract_smarts_matches(smiles):
    """Flag which toxicophore and alert patterns occur in one SMILES string.

    Returns a dict with one 0/1 entry per pattern name from ``compiled_tox``
    and ``compiled_alerts``, plus the per-group totals
    ``Total_Toxicophores_Present`` and ``Total_Alerting_Motifs_Present``.
    When RDKit cannot parse the SMILES, every entry (totals included) is the
    string ``'INVALID_SMILES'`` instead.
    """
    parsed = Chem.MolFromSmiles(smiles)

    if parsed is None:
        # Unparsable input: fill every column with the same marker value.
        marker = 'INVALID_SMILES'
        flags = dict.fromkeys([*compiled_tox, *compiled_alerts], marker)
        flags['Total_Toxicophores_Present'] = marker
        flags['Total_Alerting_Motifs_Present'] = marker
        return flags

    flags = {}
    # Each group contributes its per-pattern flags followed by its own total,
    # which keeps the column order: toxicophores, total, alerts, total.
    groups = (
        ('Total_Toxicophores_Present', compiled_tox),
        ('Total_Alerting_Motifs_Present', compiled_alerts),
    )
    for total_column, patterns in groups:
        hits = {
            label: int(parsed.HasSubstructMatch(query))
            for label, query in patterns.items()
        }
        flags.update(hits)
        flags[total_column] = sum(hits.values())

    return flags

# === Load SMILES ===
# Missing values are dropped and duplicates collapsed, so each distinct
# SMILES string is analysed exactly once.
df = pd.read_csv(smiles_csv_path)
smiles_list = df['PUBCHEM_EXT_DATASOURCE_SMILES'].dropna().unique()

# === Process and collect results ===
# One row per unique SMILES: the input string followed by its match columns.
results = [
    {'SMILES': smiles, **extract_smarts_matches(smiles)}
    for smiles in smiles_list
]

# === Save to CSV ===
output_df = pd.DataFrame(results)
output_df.to_csv(output_csv_path, index=False)

# Status messages: the emoji were stored as mis-decoded bytes and are restored here.
print("✅ SMARTS analysis complete.")
print(f"📁 Results saved to: {output_csv_path}")

In [None]:
import csv
import pandas as pd
from rdkit import Chem

# === File paths ===
# Input dataset and destination for the per-compound match table.
smiles_csv_path = r'C:\PhD\Project\Vero_76.csv'
output_csv_path = r'C:\PhD\Prompt_Engineering\Outputs2\DS5_Tox_Motif_Expanded_Individual.csv'

# === SMARTS Definitions ===
# The dictionary keys end up as column names in the output CSV, so they are
# kept stable even where a NOTE(review) flags a label/pattern mismatch.

# Toxicophores
toxicophore_smarts = {
    'Amide': '[NX3][CX3](=O)[#6]',
    'Sulfonamide': '[#6][#6]S(=O)(=O)N',
    'Carboxylic_Acid': '[CX3](=O)[OX2H1]',
    # NOTE(review): this query is an N-N=O arrangement (nitroso on nitrogen),
    # not a carbon-bound NO2 group -- confirm the intended chemistry.
    'Nitro_Aliphatic': '[#7]N=O',
    'Nitro_Aromatic': 'c1ccc([N+](=O)[O-])cc1',
    # NOTE(review): does not match the N=[N+]=[N-] form used by 'Azide_Group'
    # in the alert table below -- confirm the intended chemistry.
    'Azide': '[NX3][NX2]=[NX2]',
    'Acyl_Chloride': '[CX3](=O)Cl',
    'Anhydride': '[CX3](=O)OC(=O)[#6]',
    'Hydroxamic_Acid': '[CX3](=O)N[OH]',
    # NOTE(review): matches two directly bonded acyclic carbonyl carbons
    # (a 1,2-dicarbonyl), not a 1,3-diketone -- confirm the intended chemistry.
    'Beta_Diketone': '[C;!R](=O)[C;!R](=O)',
    # NOTE(review): this is a nitrate-type [O-][N+](=O)O arrangement --
    # confirm it is the intended query for this label.
    'Peroxynitrite': '[O-][N+](=O)O',
    'General_Nitro': '[$([N+](=O)[O-])]'
}

# Alerting Structural Motifs
alert_smarts = {
    'Aryl_Sulfonamide': '[#6][#6]S(=O)(=O)Nc1ccccc1',
    'Arylamide': 'CCc1ccc(cc1)C(=O)Nc2ccccc2',
    'Acetanilide': '[#6][#6]C(=O)Nc1ccccc1',
    'Primary_Secondary_Amine': '[NX3;H2,H1;!$(NC=O)]',
    # Fixed: the previous query '[#6]=[#6]' was a carbon-carbon DOUBLE bond,
    # so it flagged alkenes and never alkynes. '#' is the triple-bond symbol.
    'Alkyne': '[#6]#[#6]',
    'Benzoyl_Chloride': 'c1ccccc1C(=O)Cl',
    'Isothiocyanate': 'N=C=S',
    # Fixed: the previous query '[C;H1,H2]#[N]' required hydrogens on the
    # nitrile carbon, which an R-C#N carbon does not carry, so organic
    # nitriles were never matched.
    'Nitrile': '[CX2]#[NX1]',
    'Azide_Group': 'N=[N+]=[N-]',
    'Isocyanate': 'N=C=O',
    # Fixed: RDKit sanitization stores nitro groups in the charge-separated
    # form, so the previous pentavalent query 'c1ccccc1N(=O)=O' could not
    # match a parsed molecule.
    'Nitrobenzene': 'c1ccccc1[N+](=O)[O-]',
    'Alkene': 'C=C',
    'Phenol': '[O;H1]-c1ccccc1',
    'Enol': '[C;H2,H1]=[C;H2,H1]-[OH]',
    'Coumarin': 'c1ccc2c(c1)ccc(=O)o2'
}

# Compile SMARTS
# Compiled once up front so the same query objects are reused for every molecule.
compiled_tox = {name: Chem.MolFromSmarts(s) for name, s in toxicophore_smarts.items()}
compiled_alerts = {name: Chem.MolFromSmarts(s) for name, s in alert_smarts.items()}

# === Feature extraction ===
def extract_smarts_matches(smiles):
    """Flag which toxicophore and alert patterns occur in one SMILES string.

    Returns a dict with one 0/1 entry per pattern name from ``compiled_tox``
    and ``compiled_alerts``, plus the per-group totals
    ``Total_Toxicophores_Present`` and ``Total_Alerting_Motifs_Present``.
    When RDKit cannot parse the SMILES, every entry (totals included) is the
    string ``'INVALID_SMILES'`` instead.
    """
    parsed = Chem.MolFromSmiles(smiles)

    if parsed is None:
        # Unparsable input: fill every column with the same marker value.
        marker = 'INVALID_SMILES'
        flags = dict.fromkeys([*compiled_tox, *compiled_alerts], marker)
        flags['Total_Toxicophores_Present'] = marker
        flags['Total_Alerting_Motifs_Present'] = marker
        return flags

    flags = {}
    # Each group contributes its per-pattern flags followed by its own total,
    # which keeps the column order: toxicophores, total, alerts, total.
    groups = (
        ('Total_Toxicophores_Present', compiled_tox),
        ('Total_Alerting_Motifs_Present', compiled_alerts),
    )
    for total_column, patterns in groups:
        hits = {
            label: int(parsed.HasSubstructMatch(query))
            for label, query in patterns.items()
        }
        flags.update(hits)
        flags[total_column] = sum(hits.values())

    return flags

# === Load SMILES ===
# Missing values are dropped and duplicates collapsed, so each distinct
# SMILES string is analysed exactly once.
df = pd.read_csv(smiles_csv_path)
smiles_list = df['PUBCHEM_EXT_DATASOURCE_SMILES'].dropna().unique()

# === Process and collect results ===
# One row per unique SMILES: the input string followed by its match columns.
results = [
    {'SMILES': smiles, **extract_smarts_matches(smiles)}
    for smiles in smiles_list
]

# === Save to CSV ===
output_df = pd.DataFrame(results)
output_df.to_csv(output_csv_path, index=False)

# Status messages: the emoji were stored as mis-decoded bytes and are restored here.
print("✅ SMARTS analysis complete.")
print(f"📁 Results saved to: {output_csv_path}")

In [None]:
import csv
import pandas as pd
from rdkit import Chem

# === File paths ===
# Input dataset and destination for the per-compound match table.
smiles_csv_path = r'C:\PhD\Project\THP-1.csv'
output_csv_path = r'C:\PhD\Prompt_Engineering\Outputs2\DS6_Tox_Motif_Expanded_Individual.csv'

# === SMARTS Definitions ===
# The dictionary keys end up as column names in the output CSV, so they are
# kept stable even where a NOTE(review) flags a label/pattern mismatch.

# Toxicophores
toxicophore_smarts = {
    'Amide': '[NX3][CX3](=O)[#6]',
    'Sulfonamide': '[#6][#6]S(=O)(=O)N',
    'Carboxylic_Acid': '[CX3](=O)[OX2H1]',
    # NOTE(review): this query is an N-N=O arrangement (nitroso on nitrogen),
    # not a carbon-bound NO2 group -- confirm the intended chemistry.
    'Nitro_Aliphatic': '[#7]N=O',
    'Nitro_Aromatic': 'c1ccc([N+](=O)[O-])cc1',
    # NOTE(review): does not match the N=[N+]=[N-] form used by 'Azide_Group'
    # in the alert table below -- confirm the intended chemistry.
    'Azide': '[NX3][NX2]=[NX2]',
    'Acyl_Chloride': '[CX3](=O)Cl',
    'Anhydride': '[CX3](=O)OC(=O)[#6]',
    'Hydroxamic_Acid': '[CX3](=O)N[OH]',
    # NOTE(review): matches two directly bonded acyclic carbonyl carbons
    # (a 1,2-dicarbonyl), not a 1,3-diketone -- confirm the intended chemistry.
    'Beta_Diketone': '[C;!R](=O)[C;!R](=O)',
    # NOTE(review): this is a nitrate-type [O-][N+](=O)O arrangement --
    # confirm it is the intended query for this label.
    'Peroxynitrite': '[O-][N+](=O)O',
    'General_Nitro': '[$([N+](=O)[O-])]'
}

# Alerting Structural Motifs
alert_smarts = {
    'Aryl_Sulfonamide': '[#6][#6]S(=O)(=O)Nc1ccccc1',
    'Arylamide': 'CCc1ccc(cc1)C(=O)Nc2ccccc2',
    'Acetanilide': '[#6][#6]C(=O)Nc1ccccc1',
    'Primary_Secondary_Amine': '[NX3;H2,H1;!$(NC=O)]',
    # Fixed: the previous query '[#6]=[#6]' was a carbon-carbon DOUBLE bond,
    # so it flagged alkenes and never alkynes. '#' is the triple-bond symbol.
    'Alkyne': '[#6]#[#6]',
    'Benzoyl_Chloride': 'c1ccccc1C(=O)Cl',
    'Isothiocyanate': 'N=C=S',
    # Fixed: the previous query '[C;H1,H2]#[N]' required hydrogens on the
    # nitrile carbon, which an R-C#N carbon does not carry, so organic
    # nitriles were never matched.
    'Nitrile': '[CX2]#[NX1]',
    'Azide_Group': 'N=[N+]=[N-]',
    'Isocyanate': 'N=C=O',
    # Fixed: RDKit sanitization stores nitro groups in the charge-separated
    # form, so the previous pentavalent query 'c1ccccc1N(=O)=O' could not
    # match a parsed molecule.
    'Nitrobenzene': 'c1ccccc1[N+](=O)[O-]',
    'Alkene': 'C=C',
    'Phenol': '[O;H1]-c1ccccc1',
    'Enol': '[C;H2,H1]=[C;H2,H1]-[OH]',
    'Coumarin': 'c1ccc2c(c1)ccc(=O)o2'
}

# Compile SMARTS
# Compiled once up front so the same query objects are reused for every molecule.
compiled_tox = {name: Chem.MolFromSmarts(s) for name, s in toxicophore_smarts.items()}
compiled_alerts = {name: Chem.MolFromSmarts(s) for name, s in alert_smarts.items()}

# === Feature extraction ===
def extract_smarts_matches(smiles):
    """Flag which toxicophore and alert patterns occur in one SMILES string.

    Returns a dict with one 0/1 entry per pattern name from ``compiled_tox``
    and ``compiled_alerts``, plus the per-group totals
    ``Total_Toxicophores_Present`` and ``Total_Alerting_Motifs_Present``.
    When RDKit cannot parse the SMILES, every entry (totals included) is the
    string ``'INVALID_SMILES'`` instead.
    """
    parsed = Chem.MolFromSmiles(smiles)

    if parsed is None:
        # Unparsable input: fill every column with the same marker value.
        marker = 'INVALID_SMILES'
        flags = dict.fromkeys([*compiled_tox, *compiled_alerts], marker)
        flags['Total_Toxicophores_Present'] = marker
        flags['Total_Alerting_Motifs_Present'] = marker
        return flags

    flags = {}
    # Each group contributes its per-pattern flags followed by its own total,
    # which keeps the column order: toxicophores, total, alerts, total.
    groups = (
        ('Total_Toxicophores_Present', compiled_tox),
        ('Total_Alerting_Motifs_Present', compiled_alerts),
    )
    for total_column, patterns in groups:
        hits = {
            label: int(parsed.HasSubstructMatch(query))
            for label, query in patterns.items()
        }
        flags.update(hits)
        flags[total_column] = sum(hits.values())

    return flags

# === Load SMILES ===
# Missing values are dropped and duplicates collapsed, so each distinct
# SMILES string is analysed exactly once.
df = pd.read_csv(smiles_csv_path)
smiles_list = df['PUBCHEM_EXT_DATASOURCE_SMILES'].dropna().unique()

# === Process and collect results ===
# One row per unique SMILES: the input string followed by its match columns.
results = [
    {'SMILES': smiles, **extract_smarts_matches(smiles)}
    for smiles in smiles_list
]

# === Save to CSV ===
output_df = pd.DataFrame(results)
output_df.to_csv(output_csv_path, index=False)

# Status messages: the emoji were stored as mis-decoded bytes and are restored here.
print("✅ SMARTS analysis complete.")
print(f"📁 Results saved to: {output_csv_path}")

In [None]:
import csv
import pandas as pd
from rdkit import Chem

# === File paths ===
# Input dataset and destination for the per-compound match table.
smiles_csv_path = r'C:\PhD\Project\Vero_E6.csv'
output_csv_path = r'C:\PhD\Prompt_Engineering\Outputs2\DS7_Tox_Motif_Expanded_Individual.csv'

# === SMARTS Definitions ===
# The dictionary keys end up as column names in the output CSV, so they are
# kept stable even where a NOTE(review) flags a label/pattern mismatch.

# Toxicophores
toxicophore_smarts = {
    'Amide': '[NX3][CX3](=O)[#6]',
    'Sulfonamide': '[#6][#6]S(=O)(=O)N',
    'Carboxylic_Acid': '[CX3](=O)[OX2H1]',
    # NOTE(review): this query is an N-N=O arrangement (nitroso on nitrogen),
    # not a carbon-bound NO2 group -- confirm the intended chemistry.
    'Nitro_Aliphatic': '[#7]N=O',
    'Nitro_Aromatic': 'c1ccc([N+](=O)[O-])cc1',
    # NOTE(review): does not match the N=[N+]=[N-] form used by 'Azide_Group'
    # in the alert table below -- confirm the intended chemistry.
    'Azide': '[NX3][NX2]=[NX2]',
    'Acyl_Chloride': '[CX3](=O)Cl',
    'Anhydride': '[CX3](=O)OC(=O)[#6]',
    'Hydroxamic_Acid': '[CX3](=O)N[OH]',
    # NOTE(review): matches two directly bonded acyclic carbonyl carbons
    # (a 1,2-dicarbonyl), not a 1,3-diketone -- confirm the intended chemistry.
    'Beta_Diketone': '[C;!R](=O)[C;!R](=O)',
    # NOTE(review): this is a nitrate-type [O-][N+](=O)O arrangement --
    # confirm it is the intended query for this label.
    'Peroxynitrite': '[O-][N+](=O)O',
    'General_Nitro': '[$([N+](=O)[O-])]'
}

# Alerting Structural Motifs
alert_smarts = {
    'Aryl_Sulfonamide': '[#6][#6]S(=O)(=O)Nc1ccccc1',
    'Arylamide': 'CCc1ccc(cc1)C(=O)Nc2ccccc2',
    'Acetanilide': '[#6][#6]C(=O)Nc1ccccc1',
    'Primary_Secondary_Amine': '[NX3;H2,H1;!$(NC=O)]',
    # Fixed: the previous query '[#6]=[#6]' was a carbon-carbon DOUBLE bond,
    # so it flagged alkenes and never alkynes. '#' is the triple-bond symbol.
    'Alkyne': '[#6]#[#6]',
    'Benzoyl_Chloride': 'c1ccccc1C(=O)Cl',
    'Isothiocyanate': 'N=C=S',
    # Fixed: the previous query '[C;H1,H2]#[N]' required hydrogens on the
    # nitrile carbon, which an R-C#N carbon does not carry, so organic
    # nitriles were never matched.
    'Nitrile': '[CX2]#[NX1]',
    'Azide_Group': 'N=[N+]=[N-]',
    'Isocyanate': 'N=C=O',
    # Fixed: RDKit sanitization stores nitro groups in the charge-separated
    # form, so the previous pentavalent query 'c1ccccc1N(=O)=O' could not
    # match a parsed molecule.
    'Nitrobenzene': 'c1ccccc1[N+](=O)[O-]',
    'Alkene': 'C=C',
    'Phenol': '[O;H1]-c1ccccc1',
    'Enol': '[C;H2,H1]=[C;H2,H1]-[OH]',
    'Coumarin': 'c1ccc2c(c1)ccc(=O)o2'
}

# Compile SMARTS
# Compiled once up front so the same query objects are reused for every molecule.
compiled_tox = {name: Chem.MolFromSmarts(s) for name, s in toxicophore_smarts.items()}
compiled_alerts = {name: Chem.MolFromSmarts(s) for name, s in alert_smarts.items()}

# === Feature extraction ===
def extract_smarts_matches(smiles):
    """Flag which toxicophore and alert patterns occur in one SMILES string.

    Returns a dict with one 0/1 entry per pattern name from ``compiled_tox``
    and ``compiled_alerts``, plus the per-group totals
    ``Total_Toxicophores_Present`` and ``Total_Alerting_Motifs_Present``.
    When RDKit cannot parse the SMILES, every entry (totals included) is the
    string ``'INVALID_SMILES'`` instead.
    """
    parsed = Chem.MolFromSmiles(smiles)

    if parsed is None:
        # Unparsable input: fill every column with the same marker value.
        marker = 'INVALID_SMILES'
        flags = dict.fromkeys([*compiled_tox, *compiled_alerts], marker)
        flags['Total_Toxicophores_Present'] = marker
        flags['Total_Alerting_Motifs_Present'] = marker
        return flags

    flags = {}
    # Each group contributes its per-pattern flags followed by its own total,
    # which keeps the column order: toxicophores, total, alerts, total.
    groups = (
        ('Total_Toxicophores_Present', compiled_tox),
        ('Total_Alerting_Motifs_Present', compiled_alerts),
    )
    for total_column, patterns in groups:
        hits = {
            label: int(parsed.HasSubstructMatch(query))
            for label, query in patterns.items()
        }
        flags.update(hits)
        flags[total_column] = sum(hits.values())

    return flags

# === Load SMILES ===
# Missing values are dropped and duplicates collapsed, so each distinct
# SMILES string is analysed exactly once.
df = pd.read_csv(smiles_csv_path)
smiles_list = df['PUBCHEM_EXT_DATASOURCE_SMILES'].dropna().unique()

# === Process and collect results ===
# One row per unique SMILES: the input string followed by its match columns.
results = [
    {'SMILES': smiles, **extract_smarts_matches(smiles)}
    for smiles in smiles_list
]

# === Save to CSV ===
output_df = pd.DataFrame(results)
output_df.to_csv(output_csv_path, index=False)

# Status messages: the emoji were stored as mis-decoded bytes and are restored here.
print("✅ SMARTS analysis complete.")
print(f"📁 Results saved to: {output_csv_path}")

In [None]:
import csv
import pandas as pd
from rdkit import Chem

# === File paths ===
# Input dataset and destination for the per-compound match table.
smiles_csv_path = r'C:\PhD\Project\SA9-PAX8.csv'
output_csv_path = r'C:\PhD\Prompt_Engineering\Outputs2\DS8_Tox_Motif_Expanded_Individual.csv'

# === SMARTS Definitions ===
# The dictionary keys end up as column names in the output CSV, so they are
# kept stable even where a NOTE(review) flags a label/pattern mismatch.

# Toxicophores
toxicophore_smarts = {
    'Amide': '[NX3][CX3](=O)[#6]',
    'Sulfonamide': '[#6][#6]S(=O)(=O)N',
    'Carboxylic_Acid': '[CX3](=O)[OX2H1]',
    # NOTE(review): this query is an N-N=O arrangement (nitroso on nitrogen),
    # not a carbon-bound NO2 group -- confirm the intended chemistry.
    'Nitro_Aliphatic': '[#7]N=O',
    'Nitro_Aromatic': 'c1ccc([N+](=O)[O-])cc1',
    # NOTE(review): does not match the N=[N+]=[N-] form used by 'Azide_Group'
    # in the alert table below -- confirm the intended chemistry.
    'Azide': '[NX3][NX2]=[NX2]',
    'Acyl_Chloride': '[CX3](=O)Cl',
    'Anhydride': '[CX3](=O)OC(=O)[#6]',
    'Hydroxamic_Acid': '[CX3](=O)N[OH]',
    # NOTE(review): matches two directly bonded acyclic carbonyl carbons
    # (a 1,2-dicarbonyl), not a 1,3-diketone -- confirm the intended chemistry.
    'Beta_Diketone': '[C;!R](=O)[C;!R](=O)',
    # NOTE(review): this is a nitrate-type [O-][N+](=O)O arrangement --
    # confirm it is the intended query for this label.
    'Peroxynitrite': '[O-][N+](=O)O',
    'General_Nitro': '[$([N+](=O)[O-])]'
}

# Alerting Structural Motifs
alert_smarts = {
    'Aryl_Sulfonamide': '[#6][#6]S(=O)(=O)Nc1ccccc1',
    'Arylamide': 'CCc1ccc(cc1)C(=O)Nc2ccccc2',
    'Acetanilide': '[#6][#6]C(=O)Nc1ccccc1',
    'Primary_Secondary_Amine': '[NX3;H2,H1;!$(NC=O)]',
    # Fixed: the previous query '[#6]=[#6]' was a carbon-carbon DOUBLE bond,
    # so it flagged alkenes and never alkynes. '#' is the triple-bond symbol.
    'Alkyne': '[#6]#[#6]',
    'Benzoyl_Chloride': 'c1ccccc1C(=O)Cl',
    'Isothiocyanate': 'N=C=S',
    # Fixed: the previous query '[C;H1,H2]#[N]' required hydrogens on the
    # nitrile carbon, which an R-C#N carbon does not carry, so organic
    # nitriles were never matched.
    'Nitrile': '[CX2]#[NX1]',
    'Azide_Group': 'N=[N+]=[N-]',
    'Isocyanate': 'N=C=O',
    # Fixed: RDKit sanitization stores nitro groups in the charge-separated
    # form, so the previous pentavalent query 'c1ccccc1N(=O)=O' could not
    # match a parsed molecule.
    'Nitrobenzene': 'c1ccccc1[N+](=O)[O-]',
    'Alkene': 'C=C',
    'Phenol': '[O;H1]-c1ccccc1',
    'Enol': '[C;H2,H1]=[C;H2,H1]-[OH]',
    'Coumarin': 'c1ccc2c(c1)ccc(=O)o2'
}

# Compile SMARTS
# Compiled once up front so the same query objects are reused for every molecule.
compiled_tox = {name: Chem.MolFromSmarts(s) for name, s in toxicophore_smarts.items()}
compiled_alerts = {name: Chem.MolFromSmarts(s) for name, s in alert_smarts.items()}

# === Feature extraction ===
def extract_smarts_matches(smiles):
    """Flag which toxicophore and alert patterns occur in one SMILES string.

    Returns a dict with one 0/1 entry per pattern name from ``compiled_tox``
    and ``compiled_alerts``, plus the per-group totals
    ``Total_Toxicophores_Present`` and ``Total_Alerting_Motifs_Present``.
    When RDKit cannot parse the SMILES, every entry (totals included) is the
    string ``'INVALID_SMILES'`` instead.
    """
    parsed = Chem.MolFromSmiles(smiles)

    if parsed is None:
        # Unparsable input: fill every column with the same marker value.
        marker = 'INVALID_SMILES'
        flags = dict.fromkeys([*compiled_tox, *compiled_alerts], marker)
        flags['Total_Toxicophores_Present'] = marker
        flags['Total_Alerting_Motifs_Present'] = marker
        return flags

    flags = {}
    # Each group contributes its per-pattern flags followed by its own total,
    # which keeps the column order: toxicophores, total, alerts, total.
    groups = (
        ('Total_Toxicophores_Present', compiled_tox),
        ('Total_Alerting_Motifs_Present', compiled_alerts),
    )
    for total_column, patterns in groups:
        hits = {
            label: int(parsed.HasSubstructMatch(query))
            for label, query in patterns.items()
        }
        flags.update(hits)
        flags[total_column] = sum(hits.values())

    return flags

# === Load SMILES ===
# Missing values are dropped and duplicates collapsed, so each distinct
# SMILES string is analysed exactly once.
df = pd.read_csv(smiles_csv_path)
smiles_list = df['PUBCHEM_EXT_DATASOURCE_SMILES'].dropna().unique()

# === Process and collect results ===
# One row per unique SMILES: the input string followed by its match columns.
results = [
    {'SMILES': smiles, **extract_smarts_matches(smiles)}
    for smiles in smiles_list
]

# === Save to CSV ===
output_df = pd.DataFrame(results)
output_df.to_csv(output_csv_path, index=False)

# Status messages: the emoji were stored as mis-decoded bytes and are restored here.
print("✅ SMARTS analysis complete.")
print(f"📁 Results saved to: {output_csv_path}")