<a href="https://colab.research.google.com/github/MZiaAfzal71/Average_Weighted_Path_Vector/blob/main/Descriptor%20Generators/GenerateMACCSMorganFingerprints.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project repository so that its files are available locally.
!git clone https://github.com/MZiaAfzal71/Average_Weighted_Path_Vector.git

In [None]:
# Install RDKit; the `rdkit` imports further down require it.
!pip install rdkit

In [None]:
# Change into the cloned repository's "Data Files" folder (the backslash
# escapes the space). NOTE(review): presumably this is what makes the relative
# paths used later ("Excel Files/...", "Descriptors Data") resolve -- confirm.
%cd Average_Weighted_Path_Vector/Data\ Files

In [2]:
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint
from rdkit.Chem import rdFingerprintGenerator
from tqdm import tqdm
import numpy as np
import os
import pandas as pd

In [3]:
# Shared Morgan fingerprint generator: neighbourhood radius of 2 and
# fingerprints that are 1024 bits long (fpSize).
mfpgen = rdFingerprintGenerator.GetMorganGenerator(
    radius=2,
    fpSize=1024,
)

In [4]:
def mol_from_smile(sm: str) -> "Chem.Mol | None":
    """Parse a SMILES string into an RDKit molecule.

    Parameters
    ----------
    sm : str
        SMILES representation of the molecule.

    Returns
    -------
    Chem.Mol or None
        The parsed molecule, or ``None`` when ``sm`` is not a string (for
        example a NaN coming from an empty spreadsheet cell) or when RDKit
        cannot parse it.
    """
    # Reject non-string input explicitly instead of relying on RDKit raising.
    if not isinstance(sm, str):
        return None
    try:
        # MolFromSmiles already returns None for SMILES it cannot parse.
        return Chem.MolFromSmiles(sm)
    except Exception:
        # Best effort: any RDKit failure means "no molecule". Unlike a bare
        # ``except``, this still lets KeyboardInterrupt/SystemExit propagate,
        # so a long-running batch can be interrupted.
        return None

# Length of the bit vector RDKit returns for MACCS keys: the 166 keys plus an
# unused bit at position 0.
_MACCS_NUM_BITS = 167


def mol_to_maccs(mol: "Chem.Mol | None"):
    """Return the MACCS keys fingerprint of ``mol`` as a list.

    Parameters
    ----------
    mol : Chem.Mol or None
        Molecule to describe. ``None`` stands for a SMILES string that could
        not be parsed.

    Returns
    -------
    list
        The fingerprint bits (0/1), one entry per bit. For ``mol is None`` a
        list of NaN of the same length is returned, so the row is marked as
        missing instead of raising inside RDKit.
    """
    if mol is None:
        return [float("nan")] * _MACCS_NUM_BITS
    return list(GetMACCSKeysFingerprint(mol))


def mol_to_morgan(mol: "Chem.Mol | None", radius=2, nBits=1024):
    """Return the Morgan fingerprint of ``mol`` as a NumPy array.

    Parameters
    ----------
    mol : Chem.Mol or None
        Molecule to describe. ``None`` stands for a SMILES string that could
        not be parsed.
    radius : int, optional
        Morgan neighbourhood radius (2 corresponds to ECFP4).
    nBits : int, optional
        Length of the fingerprint.

    Returns
    -------
    numpy.ndarray
        One entry per fingerprint bit. For ``mol is None`` an array of NaN
        with ``nBits`` entries is returned, so the row is marked as missing
        instead of raising inside RDKit.
    """
    if mol is None:
        return np.full(nBits, np.nan)

    if radius == 2 and nBits == 1024:
        # Default settings: reuse the shared module-level generator.
        generator = mfpgen
    else:
        # Honour explicitly requested settings; previously these arguments
        # were accepted but silently ignored. A generator is built per call
        # here, so prefer the defaults inside tight loops.
        generator = rdFingerprintGenerator.GetMorganGenerator(
            radius=radius, fpSize=nBits
        )
    return np.array(generator.GetFingerprint(mol))


In [9]:
# Folder that receives the generated descriptor files; created if missing.
output_dir = "Descriptors Data"
os.makedirs(output_dir, exist_ok=True)

# Source workbook, and the names of the sheets that are read from it.
input_file = "Excel Files/Zang_Data.xlsx"
property = [
    "Log VP",
    "MP",
    "BP",
    "LogBCF",
    "LogS",
    "LogP",
]

In [None]:
# Enable Series.progress_apply so every per-molecule step shows a progress bar.
tqdm.pandas()

# One entry per descriptor family:
# (progress message, helper column in df, per-molecule function, file suffix).
descriptor_jobs = (
    ("Calculating MACCS Keys First!\n", 'maccs_Desc', mol_to_maccs, "MACCS"),
    ("Calculating Morgan fingerprints now!\n", 'morgan_Desc', mol_to_morgan, "Morgan"),
)

for prop in property:
    df = pd.read_excel(input_file, sheet_name=prop)
    df['Preferred_name'] = df['Preferred_name'].astype(str)

    # Only the leading (at most 9) columns of the sheet are exported next to
    # the descriptors. Fix that count before helper columns are appended, so a
    # narrower sheet can never pull the 'mol' objects into the parquet file.
    n_meta_cols = min(9, df.shape[1])

    print(f"\nProcessing SMILES from sheet:{prop} .... \n")

    df['mol'] = df['SMILES'].progress_apply(mol_from_smile)

    # mol_from_smile returns None for SMILES it cannot parse; the fingerprint
    # functions cannot process those, so report and drop such rows instead of
    # aborting the whole run.
    invalid = df['mol'].isna()
    if invalid.any():
        print(f"Skipping {invalid.sum()} row(s) with unparsable SMILES in sheet {prop}.\n")
        # Reset the index so the positional concat below stays row-aligned.
        df = df[~invalid].reset_index(drop=True)

    for message, column, compute, suffix in descriptor_jobs:
        print(message)
        df[column] = df['mol'].progress_apply(compute)

        # Expand the per-row fingerprints into one column per bit.
        desc_list = df[column].tolist()
        Desc_df = pd.DataFrame(desc_list)
        Desc_df.columns = [f"{prop}_{i}" for i in range(Desc_df.shape[1])]

        output_file = os.path.join(output_dir, f"{prop}_{suffix}.parquet")
        pd.concat([df.iloc[:, :n_meta_cols], Desc_df], axis=1).to_parquet(output_file, index=False)
        print(f"The result is saved to the file {output_file}.\n")

print('All sheets have been processed successfully!')


## Inspect the values of some of the computed descriptors


In [None]:
# Read one of the generated files back and preview its first rows.
morgan_logp_file = 'Descriptors Data/LogP_Morgan.parquet'
desc_data = pd.read_parquet(morgan_logp_file)
desc_data.head()