In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, rdFingerprintGenerator, MACCSkeys
import numpy as np

# Generate fingerprints

## Get fingerprints

input: SMILES

output: a dictionary of fingerprint arrays, keyed by fingerprint type

modification: edit the dictionary inside `get_all_fingerprints` to change which fingerprint types are generated

In [3]:
# Build every supported fingerprint type for a single molecule
def get_all_fingerprints(smiles, radius=2, n_bits=2048):
    """
    Compute several RDKit fingerprints for one molecule.

    Parameters:
    - smiles: SMILES string describing the molecule.
    - radius: Morgan radius used for the 'Morgan (ECFP)' and 'FCFP' fingerprints.
    - n_bits: Bit-vector length of every fingerprint except MACCS.

    Returns:
    - fingerprints: dict mapping a fingerprint name to a 1-D integer NumPy array.
      Keys are 'Morgan (ECFP)', 'FCFP', 'RDKit', 'Atom Pair',
      'Topological Torsion', 'MACCS', and 'ALL' (the six arrays above
      concatenated in that order).

    Raises:
    - ValueError: if RDKit cannot parse `smiles`.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here with
        # the offending input instead of deep inside a fingerprint call.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    # NOTE(review): newer RDKit releases reportedly flag GetMorganFingerprintAsBitVect
    # as deprecated in favour of rdFingerprintGenerator.GetMorganGenerator -- confirm
    # bit-for-bit equivalence before switching.
    bit_vectors = {
        'Morgan (ECFP)': AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits),
        'FCFP': AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits, useFeatures=True),
        'RDKit': rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=n_bits).GetFingerprint(mol),
        'Atom Pair': rdFingerprintGenerator.GetAtomPairGenerator(fpSize=n_bits).GetFingerprint(mol),
        'Topological Torsion': rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=n_bits).GetFingerprint(mol),
        'MACCS': MACCSkeys.GenMACCSKeys(mol),  # MACCS is 167 bits long
    }

    # Convert each bit vector to a NumPy array exactly once
    fingerprints = {name: np.array(bits, dtype=int) for name, bits in bit_vectors.items()}

    # 'ALL' reuses the arrays computed above (dict order == concatenation order)
    fingerprints['ALL'] = np.hstack(list(fingerprints.values()))

    return fingerprints


## Generate fingerprint Matrix

input: input file containing molecular Names, SMILES, Categories

output: DataFrame holding the bits of one chosen fingerprint type plus the molecule 'Name' column (the 'Category' column is currently commented out in the code)

function: extract one fingerprint type for every molecule via get_all_fingerprints(smiles), then attach identifying columns (name, category) so the matrix can be used for PCA and the points labelled when plotting

modification: the 'Name' and 'Category' columns are examples; change the code to attach other metadata columns as needed


In [5]:
# Build a per-molecule matrix for one chosen fingerprint type (e.g., Morgan (ECFP))
def extract_specific_fingerprint(chembl_df, fingerprint_func, target_fingerprint, smiles_column='SMILES'):
    """
    Build a DataFrame of one fingerprint type for every row of `chembl_df`.

    Parameters:
    - chembl_df: DataFrame with a SMILES column (see `smiles_column`) and a 'Name' column.
    - fingerprint_func: Callable taking a SMILES string and returning a dict of fingerprints.
    - target_fingerprint: Key to pick from that dict (e.g., 'Morgan (ECFP)').
    - smiles_column: Name of the column in `chembl_df` holding the SMILES strings.

    Returns:
    - fingerprint_df: DataFrame whose first column is 'Name', followed by one
      column per fingerprint position. The 'Category' column is not attached
      (that line is commented out below).

    Side effects:
    - Prints a header line and the first rows of the resulting DataFrame.
    """
    # One fingerprint per molecule, in the same order as the input rows
    per_molecule = []
    for smiles in chembl_df[smiles_column].tolist():
        all_types = fingerprint_func(smiles)
        per_molecule.append(all_types[target_fingerprint])

    fingerprint_df = pd.DataFrame(per_molecule)

    # Attach identifying metadata
    # fingerprint_df['Category'] = chembl_df['Category'].values
    fingerprint_df['Name'] = chembl_df['Name'].values

    # Reorder so that 'Name' leads and the fingerprint columns follow
    bit_columns = [label for label in fingerprint_df.columns if label != 'Name']
    fingerprint_df = fingerprint_df[['Name'] + bit_columns]

    # Preview the result
    print(f"\n{target_fingerprint} Fingerprint DataFrame:")
    print(fingerprint_df.head())
    return fingerprint_df


# Run

## Example

In [6]:
# Load the example molecule table with pandas' default CSV options.
# Later cells read its 'SMILES' and 'Name' columns.
chembl_example = pd.read_csv("../data/ExperimentData/43_Psychedelic_drug_SMILES_Category.csv")

### For all

In [7]:
# Build the MACCS matrix for every molecule in the example table
fingerprint_df = extract_specific_fingerprint(
    chembl_df=chembl_example,
    fingerprint_func=get_all_fingerprints,
    target_fingerprint='MACCS',
)

# Persist it next to the notebook, without the row index
fingerprint_df.to_csv('MACCS_fingerprints.csv', index=False)


MACCS Fingerprint DataFrame:
         Name  0  1  2  3  4  5  6  7  8  ...  157  158  159  160  161  162  \
0        5-HT  0  0  0  0  0  0  0  0  0  ...    1    1    1    0    1    1   
1  4-AcO-MALT  0  0  0  0  0  0  0  0  0  ...    1    1    1    1    1    1   
2   5-MeO-DMT  0  0  0  0  0  0  0  0  0  ...    1    1    0    1    1    1   
3  5-MeO-DALT  0  0  0  0  0  0  0  0  0  ...    1    1    0    1    1    1   
4  5-MeO-DiPT  0  0  0  0  0  0  0  0  0  ...    1    1    0    1    1    1   

   163  164  165  166  
0    1    1    1    0  
1    1    1    1    0  
2    1    1    1    0  
3    1    1    1    0  
4    1    1    1    0  

[5 rows x 168 columns]


### For different combinations

In [8]:
import itertools

# Names of every individual fingerprint type (same spelling as the keys of get_all_fingerprints)
fingerprint_all = [
    'Morgan (ECFP)',
    'FCFP',
    'RDKit',
    'Atom Pair',
    'Topological Torsion',
    'MACCS',
]

# Generate all possible subsets as a list of lists
# all_subsets = [list(subset) for r in range(len(fingerprint_all) + 1) for subset in itertools.combinations(fingerprint_all, r)]

# # Output all subsets
# for subset in all_subsets:
#     print(subset)

# print(all_subsets)


In [9]:
# Hand-picked fingerprint combinations to process. The leading empty list is a
# placeholder that the loop below skips via all_subsets[1:].
all_subsets = [[],
    # ['Morgan (ECFP)', 'MACCS'],
    ['MACCS', 'RDKit'],
    ['RDKit'],
    ['Morgan (ECFP)'],
    
    # ['Atom Pair', 'Topological Torsion']
]
# fingerprint_all = ['Morgan (ECFP)', 'FCFP', 'RDKit', 'Atom Pair', 'Topological Torsion', 'MACCS']
# Initialised here but never written to in this cell.
explained_variance_dict = {}

# For each combination, build one DataFrame per fingerprint type it contains.
for combinations in all_subsets[1:]:
    # Human-readable label, e.g. "MACCS + RDKit"
    combination_name = ' + '.join(combinations)
    print(f"\nProcessing fingerprint combination: {combination_name}")
    
    # Collected per-fingerprint frames; reset for every combination
    combined_fingerprints = []
    for combination in combinations:
        # `combination` is a single fingerprint name; the call also prints a preview
        fingerprint_df = extract_specific_fingerprint(chembl_example, get_all_fingerprints, combination)
        combined_fingerprints.append(fingerprint_df)
    
    # Combine fingerprints horizontally
    # (currently disabled, so the frames collected above are not used further in this cell)
    # NOTE(review): every frame has its own 'Name' column and integer bit columns
    # starting at 0, so enabling the concat as written would produce duplicate
    # column labels -- confirm the intended handling before re-enabling.
    # combined_df = pd.concat(combined_fingerprints, axis=1)
    # combined_df['Name'] = chembl_example['Name']
    # combined_df['Category'] = chembl_example['Category']
    


Processing fingerprint combination: MACCS + RDKit

MACCS Fingerprint DataFrame:
         Name  0  1  2  3  4  5  6  7  8  ...  157  158  159  160  161  162  \
0        5-HT  0  0  0  0  0  0  0  0  0  ...    1    1    1    0    1    1   
1  4-AcO-MALT  0  0  0  0  0  0  0  0  0  ...    1    1    1    1    1    1   
2   5-MeO-DMT  0  0  0  0  0  0  0  0  0  ...    1    1    0    1    1    1   
3  5-MeO-DALT  0  0  0  0  0  0  0  0  0  ...    1    1    0    1    1    1   
4  5-MeO-DiPT  0  0  0  0  0  0  0  0  0  ...    1    1    0    1    1    1   

   163  164  165  166  
0    1    1    1    0  
1    1    1    1    0  
2    1    1    1    0  
3    1    1    1    0  
4    1    1    1    0  

[5 rows x 168 columns]

RDKit Fingerprint DataFrame:
         Name  0  1  2  3  4  5  6  7  8  ...  2038  2039  2040  2041  2042  \
0        5-HT  1  0  0  0  0  1  1  0  1  ...     0     0     0     1     0   
1  4-AcO-MALT  1  0  0  1  0  1  1  1  1  ...     1     0     0     1     0   
2   5-MeO