In [11]:
from rdkit import Chem
from rdkit.Chem import Fragments
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
import pandas as pd

The descriptors should describe the molecuel as good as possible. Because we are looking for the Rf value of a compound in a certain solvent ration it seems rational to finde descriptors which are related to solubility and polarity of a molecuel. 
The overall polarity depends on the polarity of singel functional groups but also on the symmtery of a molecuel. As a start the number of some polarity influenzing functional groups seems like a good descriptor.

In [3]:

def polar_groups(Dataframe: pd.DataFrame):

    # isocyanates
    Dataframe['Isocyanates'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_isocyanate(Chem.MolFromSmiles(x)))
    # sulfones
    Dataframe['Sulfones'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_sulfone(Chem.MolFromSmiles(x)))
    # nitriles
    Dataframe['Nitriles'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_nitrile(Chem.MolFromSmiles(x)))
    # nitro groups
    Dataframe['Isocyanates'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_nitro(Chem.MolFromSmiles(x))) 
    # halogens
    Dataframe['Halogens'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_halogen(Chem.MolFromSmiles(x)))
    # amides
    Dataframe['Amides'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_amide(Chem.MolFromSmiles(x))) 

    return Dataframe

In [4]:
def low_polar_groups(Dataframe: pd.DataFrame):

    # hydroxyl groups
    Dataframe['Halogens'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_OH(Chem.MolFromSmiles(x)))
    # carboxylic acids
    Dataframe['Carboxylic_acids'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_COO2(Chem.MolFromSmiles(x)))
    # thiols
    Dataframe['Thiols'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_SH(Chem.MolFromSmiles(x)))
    # aldehydes
    Dataframe['Aldehydes'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_aldehyde(Chem.MolFromSmiles(x)))
    # ketones
    Dataframe['Ketones'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_ketone(Chem.MolFromSmiles(x)))
    # esters
    Dataframe['Esters'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_ester(Chem.MolFromSmiles(x)))
    # secondary amines
    Dataframe['Secondary_amines'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_NH2(Chem.MolFromSmiles(x))) 
    # primary amines
    Dataframe['Primary_amines'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_NH1(Chem.MolFromSmiles(x))) 

    return Dataframe

In [5]:
def unpolar_groups(Dataframe: pd.DataFrame):

    # unbranched alkanes
    Dataframe['Unbranched_alkanes'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_unbrch_alkane(Chem.MolFromSmiles(x)))
    # benzenes
    Dataframe['Benzenes'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_benzene(Chem.MolFromSmiles(x)))
    # ethers
    Dataframe['Ethers'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_ether(Chem.MolFromSmiles(x)))
    # tertiary amines
    Dataframe['Tertiary_amines'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_NH0(Chem.MolFromSmiles(x))) 
    # imines
    Dataframe['Imines'] = Dataframe['productSmiles'].apply(lambda x: Chem.Fragments.fr_imine(Chem.MolFromSmiles(x)))

    return Dataframe 


Moreover it is possible to use some predicted physical data like logP or the topological polar surface area (TPSA).

In [6]:
def physcial_desc(Dataƒrame: pd.DataFrame):

    # molecular weight
    Dataframe['Molecular_weight'] = Dataframe['productSmiles'].apply(lambda x: Chem.Descriptors.MolWt(Chem.MolFromSmiles(x)))
    # logP
    Dataframe['LogP'] = Dataframe['productSmiles'].apply(lambda x: Chem.Descriptors.MolLogP(Chem.MolFromSmiles(x)))
   # topological polar surface area
    Dataframe['TPSA'] = Dataframe['productSmiles'].apply(lambda x: Chem.Descriptors.TPSA(Chem.MolFromSmiles(x)))
    # topological polar surface area (inculde S and P)
    Dataframe['TPSA_including_S_and_P'] = Dataframe['productSmiles'].apply(lambda x: Chem.MolSurf.TPSA(Chem.MolFromSmiles(x, inculdeSandP=True)))

    return Dataframe

Some Solvents like water, Methanole, Ethanole, Carboxylic acids are able to develope H-Bonds with 
the compounds and improve theire solubility and change theire polarity. Therfore it would be intersting to know if the molecule is able to develop H-Bonds. We can test this by chekcing if it has any H-acceptors or H-donors.

In [8]:
def H_Bonds(Dataframe: pd.DataFrame):

    # number of hydrogen bond donors
    Dataframe['HBD'] = Dataframe['productSmiles'].apply(lambda x: Chem.Descriptors.NumHDonors(Chem.MolFromSmiles(x)))
    # number of hydrogen bond acceptors
    Dataframe['HBA'] = Dataframe['productSmiles'].apply(lambda x: Chem.Descriptors.NumHAcceptors(Chem.MolFromSmiles(x)))
    

    return Dataframe

Or we just use every descriptor rdkit offers:

In [19]:
df_solvents = pd.read_csv(r'/Users/matthiasgalka/git/ppchem_project/Data/Solvents.csv')

In [28]:
df_solvents.head()

Unnamed: 0.1,Unnamed: 0,Solvent,Smiles,Elutropic_series,"Density(20°C, g/ml)","Solubility_water (20°C, mg/L)",log P,"Viscosity (20°C, mPa*s)",Polarity_index,ET30 (kcal/mol),ET30N,Dielectric_constant,Dipole_moment,Solvent_type,Mol,Mw (g/mol),log P (calc),H_Bond_Donors,H_Bond_Acceptors,Descriptors
0,0,n-Hexane,CCCCCC,0.0,0.672,9.5,3.764,0.31,0.1,31.0,0.009,1.88,0.0,nonpolar,<rdkit.Chem.rdchem.Mol object at 0x1248a0f20>,86.178,2.5866,0,0,86.178
1,1,n-Pentane,CCCCC,0.0,0.626,40.0,3.255,0.23,0.0,31.0,0.009,1.84,0.0,nonpolar,<rdkit.Chem.rdchem.Mol object at 0x1248a1070>,72.151,2.1965,0,0,72.151
2,2,Cyclohexane,C1CCCCC1,0.03,0.778,60.0,,1.0,0.2,30.09,0.006,2.02,0.0,nonpolar,<rdkit.Chem.rdchem.Mol object at 0x1248a10e0>,84.162,2.3406,0,0,84.162
3,3,Cyclopentane,C1CCCC1,0.04,0.751,156.0,,,0.1,,,1.87,0.0,nonpolar,<rdkit.Chem.rdchem.Mol object at 0x1248a1150>,70.135,1.9505,0,0,70.135
4,4,Toluene,Cc1ccccc1,0.22,0.865,520.0,2.73,0.59,2.4,33.9,0.099,2.38,0.36,nonpolar,<rdkit.Chem.rdchem.Mol object at 0x1248a11c0>,92.141,1.99502,0,0,92.141


In [38]:
def all_descriptors(Dataframe: pd.DataFrame):

   discriptors = {}

   for index, row in Dataframe.iterrows():

      discriptors.update(Chem.Descriptors.CalcMolDescriptors(Chem.MolFromSmiles(row['Smiles'])))
   
   df_descriptors = pd.DataFrame(discriptors)

   #Dataframe.concat(df_descriptors)
   
   return df_descriptors

In [39]:
all_descriptors(df_solvents)



ValueError: If using all scalar values, you must pass an index

In [41]:
def all_descriptors1(Dataframe: pd.DataFrame):
    descriptors = []
    
    for index, row in Dataframe.iterrows():
        mol = Chem.MolFromSmiles(row['Smiles'])
        if mol is not None:
            descriptor_values = Descriptors.CalcMolDescriptors(mol)
            descriptors.append(descriptor_values)
    
    df_descriptors = pd.DataFrame(descriptors, columns=Descriptors._descList)
    return df_descriptors

In [42]:
all_descriptors1(df_solvents)



Unnamed: 0,"(MaxAbsEStateIndex, <function MaxAbsEStateIndex at 0x11d080af0>)","(MaxEStateIndex, <function MaxEStateIndex at 0x11d0809d0>)","(MinAbsEStateIndex, <function MinAbsEStateIndex at 0x11d080b80>)","(MinEStateIndex, <function MinEStateIndex at 0x11d080a60>)","(qed, <function qed at 0x11d083880>)","(SPS, <function SPS at 0x11d083c70>)","(MolWt, <function <lambda> at 0x11d090310>)","(HeavyAtomMolWt, <function HeavyAtomMolWt at 0x11d0903a0>)","(ExactMolWt, <function <lambda> at 0x11d090430>)","(NumValenceElectrons, <function NumValenceElectrons at 0x11d0904c0>)",...,"(fr_sulfide, <function _LoadPatterns.<locals>.<lambda> at 0x10e9f2d40>)","(fr_sulfonamd, <function _LoadPatterns.<locals>.<lambda> at 0x10e9f2f80>)","(fr_sulfone, <function _LoadPatterns.<locals>.<lambda> at 0x10e9f2ef0>)","(fr_term_acetylene, <function _LoadPatterns.<locals>.<lambda> at 0x10e9f31c0>)","(fr_tetrazole, <function _LoadPatterns.<locals>.<lambda> at 0x10e9f3880>)","(fr_thiazole, <function _LoadPatterns.<locals>.<lambda> at 0x10e9f3400>)","(fr_thiocyan, <function _LoadPatterns.<locals>.<lambda> at 0x10e9f2b90>)","(fr_thiophene, <function _LoadPatterns.<locals>.<lambda> at 0x10e9f3370>)","(fr_unbrch_alkane, <function _LoadPatterns.<locals>.<lambda> at 0x10e9f3ac0>)","(fr_urea, <function _LoadPatterns.<locals>.<lambda> at 0x10e9f3130>)"
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
