# Generating a Dataset with Descriptors and Morgan Fingerprints for Pre-Training #

## Import Necessary Libraries ##

In [66]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem.rdchem import BondType
from rdkit.Chem.rdMolDescriptors import CalcNumAmideBonds
from rdkit.Chem.rdMolDescriptors import CalcNumAtomStereoCenters

In [11]:
def PRINT() -> None:
    """Print a 'Done' banner framed by two 80-character dashed rules."""
    rule = '-' * 80
    print(rule, "Done", rule, sep="\n")
def PRINTM(M) -> None:
    """Print the message `M` framed by two 80-character dashed rules."""
    rule = '-' * 80
    # format(M) mirrors how an f-string placeholder renders its value.
    print("\n".join([rule, format(M), rule]))

## Loading the Dataset ##

In [4]:
import os
import pandas as pd

In [12]:
# The CSV path is relative to the notebook's working directory.
pt_dataset_path = os.path.join('datasets', 'final_dataset_2_0.75.csv')
pt_dataset = pd.read_csv(filepath_or_buffer=pt_dataset_path)

PRINTM(M='Loaded the dataset successfully')

--------------------------------------------------------------------------------
Loaded the dataset successfully
--------------------------------------------------------------------------------


In [13]:
# Preview the first rows of the loaded dataset.
pt_dataset.head()

Unnamed: 0,smiles,uniprot_id1,uniprot_id2,label
0,OC(=O)c1nc(sc1-c1ccc(cc1)-c1ccccc1)-c1ccc2CCCN...,P08514,P05106,0
1,CN1C(=O)N(c2cc(Cl)cc(Cl)c2)C(=O)[C@]12CN(c1nc3...,P04637,O15151,0
2,CC(C)NS(=O)(=O)c1ccc(OCC(=O)N2CCOCC2)cc1,P13612,P05556,0
3,CC(=O)N1CCN(C(=O)/C=C/c2ccc(Sc3ccccc3C(N)=O)c(...,P05362,P20701,1
4,CC(C)C1=C(SC2=N[C@](C)([C@H](N12)c1ccc(Cl)cc1)...,Q00987,P04637,1


## Generating Descriptors & Morgan Fingerprints ##

After loading the dataset, the next step is to generate descriptors and Morgan fingerprints for each molecule's `SMILES` string. These features are used to train our pre-training model with `chemprop`.

We will extract the descriptors and Morgan fingerprints using `RDKit`, an open-source cheminformatics toolkit.

In [34]:
# Column names for the hand-picked descriptors that generate_descriptors writes
# into the first descriptor slots. There must be exactly one name per value and
# in the same order; extra entries would lengthen the name list on slice
# assignment and shift every following descriptor name off its value.
# NOTE: "NumAminoBonds" holds the value of CalcNumAmideBonds (amide bonds); the
# name is kept unchanged so the existing column schema does not change.
initial_descriptor_names = [
    "NumAtomStereoCenters",
    "NumAminoBonds",
    "MolWt",
    "NumValenceElectrons",
    "TPSA",
    "MolLogP",
    "NumHeteroatoms",
    "NumRotatableBonds",
    "HeavyAtomCount",
    "FractionCSP3",
]

In [37]:
def generate_descriptors(smiles):
    """
    Compute the feature row and the matching column names for one SMILES string.

    The row is laid out as: the SMILES itself, 200 descriptor slots, then a
    1024-bit Morgan fingerprint (radius 2). The leading descriptor slots hold a
    hand-picked set of descriptors, named by the module-level
    `initial_descriptor_names`; the remaining slots are filled from
    `Descriptors.descList` in its own order, truncated or padded to 200 entries.

    Params:
    - smiles (str): SMILES representation of the molecule.

    Returns:
    - tuple(list, list): `(values, names)`, each of length 1 + 200 + 1024.
      When `smiles` is not a string or cannot be parsed, every feature value is
      None; the names are still fully populated so any row can supply a header.

    Raises:
    - ValueError: if `initial_descriptor_names` has fewer entries than there are
      hand-picked descriptors.
    """
    n_descriptors, n_bits = 200, 1024

    # Hand-picked descriptors, in the same order as initial_descriptor_names.
    initial_funcs = (
        CalcNumAtomStereoCenters,
        CalcNumAmideBonds,
        Descriptors.MolWt,
        Descriptors.NumValenceElectrons,
        Descriptors.TPSA,
        Descriptors.MolLogP,
        Descriptors.NumHeteroatoms,
        Descriptors.NumRotatableBonds,
        Descriptors.HeavyAtomCount,
        Descriptors.FractionCSP3,
    )
    n_initial = len(initial_funcs)
    if len(initial_descriptor_names) < n_initial:
        raise ValueError(
            f"initial_descriptor_names needs at least {n_initial} entries, "
            f"got {len(initial_descriptor_names)}"
        )

    # The hand-picked descriptors take the place of the first `n_initial` entries
    # of descList; everything after that keeps descList's order.
    remaining = Descriptors.descList[n_initial:]

    # Names do not depend on the molecule, so build them before parsing; this
    # lets the failure path return the same header as the success path.
    # Only the first `n_initial` names are used so that names and values stay
    # aligned even if the name list carries extra entries.
    descriptor_names = list(initial_descriptor_names[:n_initial])
    descriptor_names.extend(name for name, _ in remaining)
    # Pad with placeholder names if this RDKit build exposes fewer descriptors.
    descriptor_names.extend(
        f"PaddedDescriptor_{i}" for i in range(len(descriptor_names), n_descriptors)
    )
    # NOTE(review): some hand-picked names (e.g. "TPSA") presumably also occur
    # later in Descriptors.descList, which would yield duplicate column names -
    # verify before selecting columns by name downstream.
    morgan_fp_names = [f"MorganFP_{i}" for i in range(n_bits)]
    combined_names = ["SMILES"] + descriptor_names[:n_descriptors] + morgan_fp_names

    # Non-string inputs (e.g. NaN from a missing cell) are treated like an
    # unparseable SMILES instead of being handed to RDKit.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return [smiles] + [None] * (n_descriptors + n_bits), combined_names

    descriptor_values = [func(mol) for func in initial_funcs]
    descriptor_values.extend(func(mol) for _, func in remaining)
    # Pad with None so the row always has `n_descriptors` descriptor slots.
    descriptor_values.extend([None] * (n_descriptors - len(descriptor_values)))

    # Morgan fingerprint of size=1024 and radius=2, expanded to a list of bits.
    morgan_fp = GetMorganFingerprintAsBitVect(mol, radius=2, nBits=n_bits)

    combined_values = descriptor_values[:n_descriptors] + list(morgan_fp)
    return [smiles] + combined_values, combined_names

In [61]:
def generate_descriptors_df(df: pd.DataFrame, smiles_col: str = "SMILES") -> pd.DataFrame:
    """
    Build a feature table with one row per SMILES string found in `df`.

    Params:
    - df (DataFrame): Input frame that holds the SMILES strings.
    - smiles_col (str): Name of the column in `df` that holds the SMILES strings.

    Returns:
    - DataFrame: The first column is the SMILES string, followed by 200 descriptor
      columns and 1024 Morgan fingerprint columns (1225 columns in total). Rows
      whose SMILES could not be parsed hold None in every feature column.
    """
    rows = []
    column_names = None
    fallback_names = None
    for smiles in df[smiles_col]:
        values, names = generate_descriptors(smiles)
        rows.append(values)
        if fallback_names is None:
            fallback_names = names
        # An unparseable SMILES may come back with all-None feature names, so the
        # header is taken from the first row that carries real names rather than
        # blindly from the first row.
        if column_names is None and any(name is not None for name in names[1:]):
            column_names = names

    if column_names is None:
        # No row supplied usable names (or there were no rows at all).
        column_names = fallback_names

    # Create DataFrame in the format of (SMILES, |<descriptors>| = 1224)
    return pd.DataFrame(rows, columns=column_names)

In [62]:
# Show the loaded dataset again before extracting its SMILES column.
pt_dataset.head()

Unnamed: 0,smiles,uniprot_id1,uniprot_id2,label
0,OC(=O)c1nc(sc1-c1ccc(cc1)-c1ccccc1)-c1ccc2CCCN...,P08514,P05106,0
1,CN1C(=O)N(c2cc(Cl)cc(Cl)c2)C(=O)[C@]12CN(c1nc3...,P04637,O15151,0
2,CC(C)NS(=O)(=O)c1ccc(OCC(=O)N2CCOCC2)cc1,P13612,P05556,0
3,CC(=O)N1CCN(C(=O)/C=C/c2ccc(Sc3ccccc3C(N)=O)c(...,P05362,P20701,1
4,CC(C)C1=C(SC2=N[C@](C)([C@H](N12)c1ccc(Cl)cc1)...,Q00987,P04637,1


In [63]:
# Keep only the SMILES column and give it the 'SMILES' header that
# generate_descriptors_df reads. rename() returns a new frame, so pt_dataset is
# left untouched and the cell gives the same result every time it is re-run.
pt_dataset_smiles = pt_dataset[['smiles']].rename(columns={'smiles': 'SMILES'})

In [64]:
# Preview the SMILES-only frame.
pt_dataset_smiles.head()

Unnamed: 0,SMILES
0,OC(=O)c1nc(sc1-c1ccc(cc1)-c1ccccc1)-c1ccc2CCCN...
1,CN1C(=O)N(c2cc(Cl)cc(Cl)c2)C(=O)[C@]12CN(c1nc3...
2,CC(C)NS(=O)(=O)c1ccc(OCC(=O)N2CCOCC2)cc1
3,CC(=O)N1CCN(C(=O)/C=C/c2ccc(Sc3ccccc3C(N)=O)c(...
4,CC(C)C1=C(SC2=N[C@](C)([C@H](N12)c1ccc(Cl)cc1)...


### Generate Descriptors and Morgan Fingerprints ###

In [65]:
# Featurise every SMILES string. RDKit reports parse/valence problems as log
# lines; the affected rows come back with None in every feature column.
pt_dataset_final = generate_descriptors_df(df=pt_dataset_smiles)
PRINTM(M='Generated new data frame with descriptors and morgan fingerprints successfully !')

[15:58:44] Explicit valence for atom # 31 N, 4, is greater than permitted
[16:01:11] Explicit valence for atom # 31 N, 4, is greater than permitted
[16:01:22] SMILES Parse Error: syntax error while parsing: na
[16:01:22] SMILES Parse Error: Failed parsing SMILES 'na' for input: 'na'
[16:04:14] SMILES Parse Error: syntax error while parsing: na
[16:04:14] SMILES Parse Error: Failed parsing SMILES 'na' for input: 'na'
[16:22:13] Explicit valence for atom # 31 N, 4, is greater than permitted
[16:22:25] SMILES Parse Error: syntax error while parsing: na
[16:22:25] SMILES Parse Error: Failed parsing SMILES 'na' for input: 'na'


--------------------------------------------------------------------------------
Generated new data frame with descriptors and morgan fingerprints successfully !
--------------------------------------------------------------------------------


In [67]:
# Display the generated feature table (the rendering is truncated by pandas).
pt_dataset_final

Unnamed: 0,SMILES,NumAtomStereoCenters,NumAminoBonds,MolWt,NumValenceElectrons,TPSA,MolLogP,NumHeteroatoms,NumRotatableBonds,HeavyAtomCount,...,MorganFP_1014,MorganFP_1015,MorganFP_1016,MorganFP_1017,MorganFP_1018,MorganFP_1019,MorganFP_1020,MorganFP_1021,MorganFP_1022,MorganFP_1023
0,OC(=O)c1nc(sc1-c1ccc(cc1)-c1ccccc1)-c1ccc2CCCN...,0.0,2.0,588.714,206.0,95.42,8.43670,9.0,5.0,42.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CN1C(=O)N(c2cc(Cl)cc(Cl)c2)C(=O)[C@]12CN(c1nc3...,2.0,3.0,592.464,200.0,117.84,5.61418,12.0,4.0,40.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,CC(C)NS(=O)(=O)c1ccc(OCC(=O)N2CCOCC2)cc1,0.0,1.0,342.417,128.0,84.94,0.61090,8.0,6.0,23.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CC(=O)N1CCN(C(=O)/C=C/c2ccc(Sc3ccccc3C(N)=O)c(...,0.0,3.0,454.508,166.0,126.85,2.54880,10.0,6.0,32.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CC(C)C1=C(SC2=N[C@](C)([C@H](N12)c1ccc(Cl)cc1)...,2.0,1.0,474.457,162.0,35.91,6.32410,7.0,4.0,31.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109243,Clc1cc(OCCN2CCOCC2)ccc1Nc1nc2c(-c3nnc[nH]3)ccc...,0.0,0.0,501.978,182.0,101.08,4.67610,10.0,7.0,36.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
109244,COC(=O)c1cc(O)cc(OC)c1Oc1cc(C)cc(O)c1C(=O)O,0.0,0.0,348.307,132.0,122.52,2.69192,8.0,5.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109245,CCCC1=CN(C[C@H](NC(=O)OCc2ccccc2)C(=O)O)C(=O)N...,1.0,1.0,587.681,226.0,154.47,3.78020,12.0,12.0,43.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
109246,O=C(CSc1nc2ccccc2c(=O)n1Cc1ccc2c(c1)OCO2)NCc1c...,0.0,1.0,449.488,162.0,95.59,3.17500,9.0,7.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
# Select every row that has at least one missing value in any column.
rows_with_null = pt_dataset_final.loc[pt_dataset_final.isna().any(axis="columns")]

# Show (number of affected rows, number of columns).
rows_with_null.shape

(243, 1225)

In [74]:
# Inspect up to the first 30 rows that contain at least one missing value.
rows_with_null.head(30)

Unnamed: 0,SMILES,NumAtomStereoCenters,NumAminoBonds,MolWt,NumValenceElectrons,TPSA,MolLogP,NumHeteroatoms,NumRotatableBonds,HeavyAtomCount,...,MorganFP_1014,MorganFP_1015,MorganFP_1016,MorganFP_1017,MorganFP_1018,MorganFP_1019,MorganFP_1020,MorganFP_1021,MorganFP_1022,MorganFP_1023
179,CCN(CC)c1ccc2c(-c3ccc(S(=O)(=O)[O-])cc3S(=O)(=...,0.0,0.0,580.66,202.0,133.79,0.675,12.0,8.0,39.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
210,NC(=O)C(=O)[O-].[Na+],0.0,1.0,111.032,34.0,83.22,-5.7744,5.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
341,O=C(/C=C/c1c(C(=O)[O-])[nH]c2cc(Cl)cc(Cl)c12)N...,0.0,1.0,397.193,126.0,85.02,0.4941,8.0,4.0,26.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
834,[Na+].[Na+].[O-]c1c(Cl)cc(Cl)cc1Sc1cc(Cl)cc(Cl...,0.0,0.0,400.021,100.0,46.12,-1.3934,9.0,2.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
904,CC(=O)OCC1=C(C(=O)[O-])N2C(=O)[C@@H](NC(=O)CSc...,2.0,2.0,445.454,148.0,128.73,-3.8553,12.0,7.0,29.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2518,CCCCCCCCCCCCO[C@H]1O[C@H](COS(=O)(=O)[O-])[C@@...,25.0,0.0,2629.78,784.0,1155.18,-63.1634,106.0,57.0,148.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2724,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(COC...,2.0,2.0,477.456,160.0,176.34,-4.9532,15.0,7.0,31.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2887,S=C([S-])OC1CC2CC1C1CCCC21.[K+],5.0,0.0,266.472,78.0,9.23,-0.3365,4.0,1.0,15.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3147,C=CC1=C(C)c2cc3[nH]c(cc4nc(cc5[nH]c(cc1n2)c(C)...,0.0,0.0,606.634,214.0,137.62,-1.15736,10.0,8.0,44.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3151,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(COC...,2.0,2.0,477.456,160.0,176.34,-4.9532,15.0,7.0,31.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [72]:
# Names of the columns that contain at least one missing value.
columns_with_null = pt_dataset_final.columns[pt_dataset_final.isna().any(axis="index")]

# Plain-text listing of the affected column names.
print(columns_with_null)

Index(['NumAtomStereoCenters', 'NumAminoBonds', 'MolWt', 'NumValenceElectrons',
       'TPSA', 'MolLogP', 'NumHeteroatoms', 'NumRotatableBonds',
       'HeavyAtomCount', 'FractionCSP3',
       ...
       'MorganFP_1014', 'MorganFP_1015', 'MorganFP_1016', 'MorganFP_1017',
       'MorganFP_1018', 'MorganFP_1019', 'MorganFP_1020', 'MorganFP_1021',
       'MorganFP_1022', 'MorganFP_1023'],
      dtype='object', length=1224)


In [76]:
# Number of columns that contain at least one missing value.
len(columns_with_null)

1224