## 2019-03-28 Making Dataframes
Today was spent turning the large .sd file into a
series of more manageable dataframes.
The script /src/data/make_panda_dataframes.py
does the conversion and takes about 20 minutes on my laptop.

In [3]:
import numpy as np
import sklearn
from rdkit import Chem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from IPython.display import SVG, IFrame
import gzip
import os
import pickle
import pandas as pd

In [4]:
def draw_molecule(molec, molsize):
    """Render a molecule as an inline SVG in the notebook.

    Parameters
    ----------
    molec
        Molecule to draw (presumably an ``rdkit.Chem.Mol``). It is copied
        before the 2D layout is computed, so the caller's object keeps
        whatever conformers it already had.
    molsize
        Indexable pair; ``molsize[0]`` is the canvas width and
        ``molsize[1]`` the canvas height passed to ``MolDraw2DSVG``.

    Returns
    -------
    None
        The drawing is emitted through ``display`` as a side effect.
    """
    # Compute2DCoords writes the new layout onto the molecule it receives
    # (by default discarding existing conformers), so lay out a private copy
    # rather than silently modifying the caller's molecule.
    mol_copy = Chem.Mol(molec)
    rdDepictor.Compute2DCoords(mol_copy)

    drawer = rdMolDraw2D.MolDraw2DSVG(molsize[0], molsize[1])
    drawer.DrawMolecule(mol_copy)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()

    # Drop the "svg:" prefix from the generated markup before wrapping it in
    # SVG; presumably the notebook front end does not render prefixed tags.
    display(SVG(svg.replace("svg:", "")))


The dataframes are now stored in /data/processed/*.pd.pkl as pickled
pandas DataFrames. Here we load one of them to see what it looks like.

In [17]:
# Load one of the pickled frames from the processed-data directory.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source (here: our own preprocessing output).
# pd.read_pickle is pandas' own loader for pickled objects and takes the path
# directly, so no manual file handle is needed.
df = pd.read_pickle("../data/processed/curated_set_with_publication_year.pd.pkl")

# The table is wide; lift the column limit so no column is elided.
pd.options.display.max_columns = None

# Preview the first rows only instead of handing the whole frame to display.
display(df.head(10))


Unnamed: 0,BIOACT_PCHEMBL_VALUE,CMP_ACD_LOGD,CMP_ACD_LOGP,CMP_ALOGP,CMP_AROMATIC_RINGS,CMP_CHEMBL_ID,CMP_FULL_MWT,CMP_HBA,CMP_HBD,CMP_HEAVY_ATOMS,CMP_INORGANIC_FLAG,CMP_LOGP,CMP_MED_CHEM_FRIENDLY,CMP_MOLECULAR_SPECIES_ACID,CMP_MOLECULAR_SPECIES_BASE,CMP_MOLECULAR_SPECIES_NEUTRAL,CMP_MOLECULAR_SPECIES_ZWITTERION,CMP_MOLREGNO,CMP_NUM_ALERTS,CMP_NUM_RO5_VIOLATIONS,CMP_PSA,CMP_RO3_PASS,CMP_RTB,CMP_STANDARD_INCHI_KEY,CMP_STRUCTURE_TYPE,CMP_TYPE_PROTEIN,CMP_TYPE_SMALL_MOLECULE,DOC_YEAR,SMILES,TC_key,TGT_CHEMBL_ID,TGT_ORGANISM,TGT_TID
CHEMBL209 - CHEMBL19831,7.78,-2.86,0.58,-0.99,1,CHEMBL19831,568.58,11,5,39,True,-0.205,False,False,False,False,True,23108,4,2,231.95,False,14,OECPVFSZEWVKDZ-GOSISDBHSA-N,MOL,False,True,2002,COC(=O)c1ccc(F)cc1CS(=O)(=O)Nc1ccc(C)n(CC(=O)N...,CHEMBL209 - CHEMBL19831,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL332157,8.19,0.05,2.00,1.19,4,CHEMBL332157,462.48,7,4,33,True,1.595,True,False,False,False,True,202126,2,0,191.11,False,6,RDSVRCGNSLPORJ-UHFFFAOYSA-N,MOL,False,True,2003,N=C(N)c1cccc(-n2nnnc2C(=O)Nc2ccc(-c3ccccc3S(N)...,CHEMBL209 - CHEMBL332157,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL52427,6.89,1.72,3.72,4.75,4,CHEMBL52427,341.41,2,4,26,True,4.235,True,False,False,False,True,78590,2,0,85.89,False,3,ALBWLRSLXFJFTE-UHFFFAOYSA-N,MOL,False,True,2001,Cc1cc(-c2ccccc2)c(O)c(-c2cc3cc(C(=N)N)ccc3[nH]...,CHEMBL209 - CHEMBL52427,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL353213,5.17,1.74,1.94,0.50,3,CHEMBL353213,430.50,5,2,32,True,1.220,False,False,False,False,True,277230,0,0,91.1,False,8,YTPXTRPVTXEMKM-UHFFFAOYSA-N,MOL,False,True,2003,Cc1cnc(NCCc2ccccc2)c(=O)n1CC(=O)NCc1ccc2nccn2c1C,CHEMBL209 - CHEMBL353213,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL100672,7.39,1.62,1.62,2.11,0,CHEMBL100672,166.22,2,0,12,True,1.865,True,False,False,False,True,164206,2,0,26.3,True,2,WWRICQDSYZBXSH-HLTSFMKQSA-N,MOL,False,True,1998,C=CC[C@H]1C(=O)O[C@H]2CCC[C@@H]21,CHEMBL209 - CHEMBL100672,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL99622,9.92,2.24,2.24,3.07,1,CHEMBL99622,234.31,3,0,16,True,2.655,True,False,False,False,True,164053,1,0,51.6,False,2,LHVRINJTNXVOIY-SDDRHHMPSA-N,MOL,False,True,1998,O=C1O[C@H]2CCC[C@@H]2[C@H]1Sc1ccccc1,CHEMBL209 - CHEMBL99622,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL256892,8.09,-1.40,0.60,-0.29,2,CHEMBL256892,446.52,6,5,31,True,0.155,True,False,False,False,True,434115,4,0,189.73,False,9,VXDAVYUFYPFGDX-SNPRPXQTSA-N,MOL,False,True,2004,CC(=O)N1C[C@H](O)C[C@H]1C(=O)N[C@@H](CCCNC(=N)...,CHEMBL209 - CHEMBL256892,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL101041,9.30,1.15,1.15,2.41,0,CHEMBL101041,222.28,3,0,16,True,1.780,False,False,False,False,True,163967,2,0,43.37,True,3,FRPFDONKAHIRLY-WOPDTQHZSA-N,MOL,False,True,1998,CC(C)=CC(=O)C[C@@H]1C(=O)O[C@H]2CCC[C@@H]21,CHEMBL209 - CHEMBL101041,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL168411,4.73,2.43,2.43,2.99,4,CHEMBL168411,619.71,6,3,46,True,2.710,True,False,False,False,True,281497,3,1,154.47,False,10,AWCMDTMSPRXSSZ-RWPDHJIBSA-N,MOL,False,True,1999,Cn1c(C(=O)[C@H](Cc2cccc(C(=N)N)c2)NC(=O)[C@@H]...,CHEMBL209 - CHEMBL168411,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL47207,6.77,-1.51,-0.51,3.50,3,CHEMBL47207,445.53,4,3,33,True,1.495,True,False,False,False,True,67106,4,0,109.15,False,9,GJEPQTOOGGNJLG-UZUQRXQVSA-O,MOL,False,True,2002,COC(=O)[C@H](Cc1cccc(C(=N)N)c1)[C@@H](C)NC(=O)...,CHEMBL209 - CHEMBL47207,CHEMBL209,Homo sapiens,42


In [18]:
# Show the dtype pandas holds for every column of df.
# In the output below, several columns whose values look numeric or boolean
# are reported as object (e.g. CMP_MOLREGNO, CMP_PSA, CMP_RO3_PASS).
df.dtypes

BIOACT_PCHEMBL_VALUE                float64
CMP_ACD_LOGD                        float64
CMP_ACD_LOGP                        float64
CMP_ALOGP                           float64
CMP_AROMATIC_RINGS                    int64
CMP_CHEMBL_ID                        object
CMP_FULL_MWT                        float64
CMP_HBA                               int64
CMP_HBD                               int64
CMP_HEAVY_ATOMS                       int64
CMP_INORGANIC_FLAG                     bool
CMP_LOGP                            float64
CMP_MED_CHEM_FRIENDLY                object
CMP_MOLECULAR_SPECIES_ACID             bool
CMP_MOLECULAR_SPECIES_BASE             bool
CMP_MOLECULAR_SPECIES_NEUTRAL          bool
CMP_MOLECULAR_SPECIES_ZWITTERION       bool
CMP_MOLREGNO                         object
CMP_NUM_ALERTS                        int64
CMP_NUM_RO5_VIOLATIONS                int64
CMP_PSA                              object
CMP_RO3_PASS                         object
CMP_RTB                         

Now how do we get useful information out of this? We have to select the numerical columns from the table.


In [None]:
# Names of the columns to treat as numerical features when selecting from df.
# Every name is listed exactly once: a repeated label would make
# df[numerical_cols] return that column twice.
# NOTE(review): df.dtypes reports CMP_PSA as object, so it presumably needs a
# conversion (e.g. pd.to_numeric) before numeric use - confirm.
numerical_cols = [
    "BIOACT_PCHEMBL_VALUE",
    "CMP_ACD_LOGD",
    "CMP_ACD_LOGP",
    "CMP_ALOGP",
    "CMP_AROMATIC_RINGS",
    "CMP_FULL_MWT",
    "CMP_HBA",
    "CMP_HBD",
    "CMP_LOGP",
    "CMP_MOLECULAR_SPECIES_ACID",
    "CMP_MOLECULAR_SPECIES_BASE",
    "CMP_MOLECULAR_SPECIES_NEUTRAL",
    "CMP_MOLECULAR_SPECIES_ZWITTERION",
    "CMP_PSA",
]

In [23]:
# Element-wise check: True for each row whose CMP_STRUCTURE_TYPE is "MOL".
df["CMP_STRUCTURE_TYPE"].eq("MOL")

CHEMBL209 - CHEMBL19831      True
CHEMBL209 - CHEMBL332157     True
CHEMBL209 - CHEMBL52427      True
CHEMBL209 - CHEMBL353213     True
CHEMBL209 - CHEMBL100672     True
CHEMBL209 - CHEMBL99622      True
CHEMBL209 - CHEMBL256892     True
CHEMBL209 - CHEMBL101041     True
CHEMBL209 - CHEMBL168411     True
CHEMBL209 - CHEMBL47207      True
CHEMBL209 - CHEMBL355060     True
CHEMBL209 - CHEMBL44586      True
CHEMBL209 - CHEMBL366921     True
CHEMBL209 - CHEMBL44113      True
CHEMBL209 - CHEMBL431292     True
CHEMBL209 - CHEMBL102081     True
CHEMBL209 - CHEMBL102115     True
CHEMBL209 - CHEMBL180105     True
CHEMBL209 - CHEMBL19453      True
CHEMBL209 - CHEMBL19666      True
CHEMBL209 - CHEMBL296046     True
CHEMBL209 - CHEMBL295454     True
CHEMBL209 - CHEMBL179270     True
CHEMBL209 - CHEMBL46618      True
CHEMBL209 - CHEMBL277695     True
CHEMBL209 - CHEMBL279199     True
CHEMBL209 - CHEMBL167550     True
CHEMBL209 - CHEMBL92251      True
CHEMBL209 - CHEMBL92883      True
CHEMBL209 - CH