In [1]:
### Step 1: Initiation ###

from rdkit import Chem, rdBase
from rdkit.Chem import Draw,rdDepictor
from rdkit.Chem import AllChem, Descriptors, DataStructs
from rdkit.Chem.Draw import IPythonConsole, rdMolDraw2D
from rdkit.Chem.Scaffolds import rdScaffoldNetwork
from datetime import datetime
# import math
# import matplotlib as mpl
# from matplotlib import pyplot as plt
# import os
# import pyvis
# from pyvis.network import Network
# import inspect
import psycopg2
from collections import defaultdict

# DATABASE Connections

import os

# Chemistry Database
# Connection settings are read from the environment so that real credentials
# never have to be saved inside the notebook. The fallback values reproduce
# the original local development setup, so the cell behaves as before when
# none of the CHEM_DB_* variables are set.
db_chem = psycopg2.connect(
    host=os.environ.get("CHEM_DB_HOST", "localhost"),
    dbname=os.environ.get("CHEM_DB_NAME", "Chemistry"),
    user=os.environ.get("CHEM_DB_USER", "postgres"),
    password=os.environ.get("CHEM_DB_PASSWORD", "postgres"),
)
# create a cursor
cur = db_chem.cursor()
# ask the server for its version string and display it
cur.execute('SELECT version()')
db_ver = cur.fetchone()
print('PostgreSQL database version:', db_ver)


print('RDKit version: ',rdBase.rdkitVersion)
# print('MatplotLib version:', mpl.__version__)
# print('Pyvis version:', pyvis.__version__)
print(datetime.now())

PostgreSQL database version: ('PostgreSQL 12.5 (Ubuntu 12.5-0ubuntu0.20.04.1) on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0, 64-bit',)
RDKit version:  2020.03.3
2021-01-18 21:58:31.010420


In [4]:
### Step 2: Generate Fingerprints for New Structures ("no fingerprints") ###

import collections
import fingerprinter as fpt

# SELECT Structures without fingerprints

# Morgan radius used for both the fragment SMILES and the bit counts below.
# Keeping it in one place guarantees both describe the same atom environments.
FP_RADIUS = 3

cur = db_chem.cursor()
sql = 'SELECT s."Structure_ID", s."SMILES" FROM public."Structures" s \
WHERE s."Structure_ID" IN (SELECT f."Structure_ID" FROM public."Structures" f \
WHERE NOT EXISTS (SELECT 1 FROM public."Structure_Fingerprint" fp WHERE f."Structure_ID" = fp."Structure_ID"))'
cur.execute(sql)

# Each row is (Structure_ID, SMILES)
mollist = cur.fetchall()
print('= = = Molecules without Fingerprints = = =')
print(len(mollist))
print(mollist)


# generate Morgan fingerprints for molecules

# fp_key / fp_val are not used by any cell visible here; kept in case a cell
# outside this view reads them.
fp_key = {}
fp_val = []
fp_dict = collections.defaultdict(list)  # bit id -> one occurrence count per molecule
fp_allmol = []                           # rows of [Structure_ID, bit id, occurrence count]
fp_smi_list = {}                         # bit id -> fragment SMILES

for mol in mollist:
    print()
    print('Molecule : ',mol)
    m = Chem.MolFromSmiles(mol[1])
    if m is None:
        # RDKit returns None instead of raising when a SMILES cannot be
        # parsed. Skip the row so one bad record does not abort the batch;
        # it stays without fingerprints and is reported again on the next run.
        print('Skipped (SMILES could not be parsed): ', mol)
        continue

    # Generate Fingerprint SMILES and SMILES list
    fp_smi = fpt.FingerprintToSmiles(m, FP_RADIUS)
    print('Fingerprint : ',fp_smi)
    for f in fp_smi:
        fp_smi_list[f[0]] = f[1]

    # Generate Fingerprint Footprints:
    # the call is made to fill `big` (bit id -> list of hits for that bit)
    big = {}
    fp = AllChem.GetMorganFingerprint(m, FP_RADIUS, bitInfo=big)
    for fpb,v in big.items():
        fp_dict[fpb].append(len(v))
        fp_allmol.append([mol[0], fpb, len(v)])

print()
print("All Fingerprints by molecule")
print(fp_allmol)

print()
print(fp_smi_list)
print(len(fp_smi_list),'SMILES')

for fpb,v in fp_smi_list.items():
    print(fpb,' - ',v)

print()
print(fp_dict)
print(len(fp_dict),'fingerprints')

for fpb,v in fp_dict.items():
    print(fpb,' - ',len(v),' - ',v)
        


= = = Molecules without Fingerprints = = =
4
[(35, 'CCSC'), (36, 'CCPCC'), (37, 'CCOP(=O)C'), (38, 'CCCNCC')]

Molecule :  (35, 'CCSC')
Fingerprint :  [(229197718, 'CSC'), (1026928756, 'S'), (2245384272, 'C'), (2246728737, 'C'), (2592252298, 'CS'), (3542456614, 'CC'), (3654915120, 'CCSC'), (4095742117, 'CCS')]

Molecule :  (36, 'CCPCC')
Fingerprint :  [(255169469, 'CPC'), (854250975, 'CCPCC'), (984453308, 'P'), (1430395551, 'CCP'), (2245384272, 'C'), (2246728737, 'C'), (3542456614, 'CC'), (3921171501, 'CCPC')]

Molecule :  (37, 'CCOP(=O)C')
Fingerprint :  [(670191639, 'CO[PH](C)=O'), (864674487, 'O'), (864942730, 'O'), (894089450, 'CCOP'), (922668176, 'COP'), (984193073, 'P'), (1979701002, 'CCO[PH](C)=O'), (2235923055, 'O=P'), (2245384272, 'C'), (2246728737, 'C'), (3542456614, 'CC'), (3627969527, 'C[PH](=O)O'), (3826145483, 'CP'), (3994088662, 'CCO')]

Molecule :  (38, 'CCCNCC')
Fingerprint :  [(264316921, 'CCCNCC'), (773607102, 'CCN'), (787069595, 'CCN'), (847961216, 'N'), (894436332,

In [12]:
### Step 5: Update Shannon Entropies ###


# This is part of netprepper
#    import pandas as pd
#    from scipy import stats as scistat

# import sys
# print(sys.path)
# set PYTHONPATH to the directory that holds the private modules and packages (here pkg_mod)
import netprepper

# SELECT the per-structure fingerprint counts and recompute Shannon entropies

import os

# Connection settings are read from the environment so credentials are not
# stored in the notebook; the fallbacks match the original local setup.
db_chem = psycopg2.connect(
    host=os.environ.get("CHEM_DB_HOST", "localhost"),
    dbname=os.environ.get("CHEM_DB_NAME", "Chemistry"),
    user=os.environ.get("CHEM_DB_USER", "postgres"),
    password=os.environ.get("CHEM_DB_PASSWORD", "postgres"),
)
cur = db_chem.cursor()
sql = 'SELECT sf."Structure_ID", sf."FP_ID", sf."FP_Count", fp."FP_Shannon" FROM public."Structure_Fingerprint" sf \
JOIN public."Fingerprints" fp ON sf."FP_ID" = fp."Fingerprint_ID" WHERE fp."FP_Type" = 1'
cur.execute(sql)
# Each row is (Structure_ID, FP_ID, FP_Count, FP_Shannon)
fp_list = cur.fetchall()

fp_dict = collections.defaultdict(list)  # FP_ID -> list of FP_Count values
fp_mol = collections.defaultdict(list)   # Structure_ID -> list of FP_IDs

for m in fp_list:
    print(m, m[1],m[2])
    fp_dict[m[1]].append(m[2])
    fp_mol[m[0]].append(m[1])

print(len(fp_dict),'fingerprints')
print(len(fp_mol),'structures')
# Despite the name, this is the number of distinct structures; it is the
# second argument handed to netprepper.shannon for every fingerprint.
n_fp = len(fp_mol)

sql = 'UPDATE public."Fingerprints" SET "FP_Shannon" = %s WHERE "Fingerprint_ID" = %s'
cur = db_chem.cursor()
try:
    for fpb, i in fp_dict.items():
        # ns[0] is printed for inspection; ns[1] is the value stored as FP_Shannon
        ns = netprepper.shannon(i,n_fp)
        print(fpb,' - ',ns[0],' - ',ns[1])
        cur.execute(sql, (ns[1], fpb))
    db_chem.commit()
except Exception:
    # Leave the connection usable for later cells instead of stuck in an
    # aborted transaction, then surface the original error.
    db_chem.rollback()
    raise

# insert/update entropy table


(10, 10, 1, 0.22371807606583377) 10 1
(10, 7, 1, 0.22371807606583377) 7 1
(10, 8, 1, 1.2540994652497848) 8 1
(10, 9, 1, 0.6774944044487072) 9 1
(10, 11, 1, 1.073383620336505) 11 1
(10, 12, 1, 0.22371807606583377) 12 1
(11, 5, 1, 0.22371807606583377) 5 1
(11, 13, 1, 0.362210557135449) 13 1
(11, 8, 1, 1.2540994652497848) 8 1
(11, 9, 1, 0.6774944044487072) 9 1
(11, 14, 1, 0.22371807606583377) 14 1
(11, 11, 1, 1.073383620336505) 11 1
(13, 15, 1, 0.22371807606583377) 15 1
(13, 8, 1, 1.2540994652497848) 8 1
(13, 9, 2, 0.6774944044487072) 9 2
(13, 11, 2, 1.073383620336505) 11 2
(14, 16, 1, 0.4659992569610426) 16 1
(14, 9, 2, 0.6774944044487072) 9 2
(14, 17, 2, 0.4437572842601484) 17 2
(14, 18, 1, 0.22371807606583377) 18 1
(15, 19, 1, 0.22371807606583377) 19 1
(15, 20, 2, 0.22371807606583377) 20 2
(15, 16, 1, 0.4659992569610426) 16 1
(15, 8, 2, 1.2540994652497848) 8 2
(15, 9, 2, 0.6774944044487072) 9 2
(15, 21, 1, 0.22371807606583377) 21 1
(15, 11, 2, 1.073383620336505) 11 2
(15, 22, 2, 0.4437

79  -  [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.336495757583516
80  -  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.20619205063323187
81  -  [2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.20619205063323187
82  -  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.20619205063323187
83  -  [2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.40945861869508926
84  -  [2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.40945861869508926
85  -  [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.336495757583516
86  -  [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.336495757583516
87  -  [2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.40945861869508926
88  -  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.20619205063323187
89  -  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.20619205063323187
90  -  [1, 0, 0, 0, 0, 0, 0

In [6]:
# Generate Fingerprint SMILES

from rdkit import Chem
from rdkit.Chem import AllChem

def FingerprintToSmiles(m, s):
    """Pair every Morgan fingerprint bit of a molecule with a fragment SMILES.

    Args:
        m: RDKit molecule to fingerprint.
        s: radius passed to ``AllChem.GetMorganFingerprint``.

    Returns:
        A list of ``(bit_id, smiles)`` tuples, one per key of the bit-info
        dictionary and in that dictionary's iteration order. Only the first
        hit recorded for a bit is rendered; it is read as
        ``(atom index, radius)``.
    """
    bit_info = {}
    # The fingerprint itself is not needed; the call fills bit_info.
    AllChem.GetMorganFingerprint(m, s, bitInfo=bit_info)

    def _render(center, radius):
        # Radius 0 means a single atom: use its element symbol,
        # lower-cased when the atom is aromatic.
        if not radius > 0:
            atom = m.GetAtomWithIdx(center)
            symbol = atom.GetSymbol()
            return symbol.lower() if atom.GetIsAromatic() else symbol
        # Larger radii: extract the environment around the centre atom as a
        # sub-molecule and write it out as SMILES.
        environment = Chem.FindAtomEnvironmentOfRadiusN(m, radius, center)
        fragment = Chem.PathToSubmol(m, environment, atomMap={})
        return Chem.MolToSmiles(fragment)

    return [
        (bit, _render(hits[0][0], hits[0][1]))
        for bit, hits in bit_info.items()
    ]

In [2]:
# Insert novel structures after structure check

import os

smi_in = ("CCSCC", "CCSC", "CCPCC")

# Connection settings are read from the environment so credentials are not
# stored in the notebook; the fallbacks match the original local setup.
db_chem = psycopg2.connect(
    host=os.environ.get("CHEM_DB_HOST", "localhost"),
    dbname=os.environ.get("CHEM_DB_NAME", "Chemistry"),
    user=os.environ.get("CHEM_DB_USER", "postgres"),
    password=os.environ.get("CHEM_DB_PASSWORD", "postgres"),
)
cur = db_chem.cursor()
sql = 'INSERT INTO public."Structures"("SMILES") VALUES(%s);'
for i in smi_in:
    try:
        cur.execute(sql, (i,))
        # Commit each row on its own so a later failure cannot discard a
        # structure that has already been reported as inserted.
        db_chem.commit()
        print('Inserted:', i)
    except psycopg2.IntegrityError:
        # Constraint violation - presumably the SMILES is already stored.
        # rollback() clears the aborted transaction so the same connection
        # can be used for the next row; no reconnect is needed.
        db_chem.rollback()
        print("Structure Exist: ", i)
    except psycopg2.Error as err:
        # Any other database problem is reported as what it is instead of
        # being mislabelled as a duplicate; remaining rows are still tried.
        db_chem.rollback()
        print("Insert failed: ", i, '-', err)




Structure Exist:  CCSCC
Inserted: CCSC
Inserted: CCPCC


In [3]:
import chemistry_psql as chemps

# SMILES strings handed to the chemistry_psql helper module
# (presumably it performs the same insert as the hand-written cell - TODO confirm)
smi_in = (
    "CCOP(=O)C",
    "CCCNCC",
)

chemps.SmilesInsert(smi_in)

Inserted: CCOP(=O)C
Inserted: CCCNCC


In [5]:
### Step 3: Insert new fingerprints into fingerprint table ###

# Load the fingerprints that are already stored
cur = db_chem.cursor()
sql = 'SELECT * FROM public."Fingerprints" WHERE "FP_Type" = 1'
cur.execute(sql)
fpdblist = cur.fetchall()

# Column 3 of a "Fingerprints" row is what the Morgan bit ids are matched
# against (presumably "FP_Number" - this depends on the table's column order;
# TODO select the column by name). A set makes each lookup constant-time
# instead of rescanning the whole table for every fingerprint.
known_numbers = {row[3] for row in fpdblist}
new_fps = [(fpb, v) for fpb, v in fp_smi_list.items() if fpb not in known_numbers]

# Announce what is actually going to be written, not merely that candidates exist
print()
if len(new_fps) == 0:
    print('No new fingerprint inserts')
else:
    print("New inserted fingerprints")
print()

sql = 'INSERT INTO public."Fingerprints"("FP_Smiles", "FP_Number", "FP_Type") VALUES(%s, %s, 1);'

# Write all new fingerprints in one transaction so the table never ends up
# holding only part of a batch.
try:
    for fpb, v in new_fps:
        cur.execute(sql, (v, fpb))
    db_chem.commit()
except Exception:
    # Keep the connection usable for later cells, then surface the error
    db_chem.rollback()
    raise

# Report only after the commit has succeeded. The literal 0 keeps the output
# format of the original "found" flag column.
for fpb, v in new_fps:
    print(fpb,' - ',v, 0, "inserted")



New inserted fingerprints

229197718  -  CSC 0 inserted
3654915120  -  CCSC 0 inserted
255169469  -  CPC 0 inserted
854250975  -  CCPCC 0 inserted
984453308  -  P 0 inserted
1430395551  -  CCP 0 inserted
3921171501  -  CCPC 0 inserted
670191639  -  CO[PH](C)=O 0 inserted
864942730  -  O 0 inserted
894089450  -  CCOP 0 inserted
922668176  -  COP 0 inserted
984193073  -  P 0 inserted
1979701002  -  CCO[PH](C)=O 0 inserted
2235923055  -  O=P 0 inserted
3627969527  -  C[PH](=O)O 0 inserted
3826145483  -  CP 0 inserted
264316921  -  CCCNCC 0 inserted
787069595  -  CCN 0 inserted
2363984686  -  CCCN 0 inserted
2648064603  -  CCNCC 0 inserted
2898623098  -  CCCNC 0 inserted


In [8]:
### Step 4: Insert Fingerprints for New Molecules ###

# Load the stored fingerprints so Morgan bit ids can be translated to row ids
cur = db_chem.cursor()
sql = 'SELECT * FROM public."Fingerprints" WHERE "FP_Type" = 1'
cur.execute(sql)
fpdblist = cur.fetchall()

# Map column 3 (matched against the Morgan bit id) to column 0 (stored as
# "FP_ID" below). Relies on the table's column order - TODO select by name.
fpid = {}
for m in fpdblist:
    fpid[m[3]] = m[0]

# Resolve every link before writing anything, so a missing fingerprint is
# reported up front rather than halfway through the batch.
unknown = sorted({m[1] for m in fp_allmol if m[1] not in fpid})
if unknown:
    raise KeyError('No "Fingerprints" row for FP number(s) %s - run Step 3 first' % unknown)

cur = db_chem.cursor()
sql = 'INSERT INTO public."Structure_Fingerprint"("Structure_ID","FP_ID","FP_Count") VALUES (%s,%s,%s);'

# Insert Notification
print()
if len(fp_allmol) == 0:
    print('No new structure/fingerprint inserts')
else:
    print('New structure/fingerprint links')
print()

# All links go into one transaction. Step 2 only picks up structures that
# have no fingerprint rows at all, so a partially written structure would
# never be completed by a later run.
try:
    for m in fp_allmol:
        i = fpid[m[1]]
        cur.execute(sql, (m[0],i,m[2]))
    db_chem.commit()
except Exception:
    # Keep the connection usable for later cells, then surface the error
    db_chem.rollback()
    raise

# Report only what has actually been committed
for m in fp_allmol:
    i = fpid[m[1]]
    print(m,i)
    



New structure/fingerprint links

[35, 229197718, 1] 89
[35, 1026928756, 1] 16
[35, 2245384272, 1] 8
[35, 2246728737, 2] 9
[35, 2592252298, 1] 17
[35, 3542456614, 1] 11
[35, 3654915120, 1] 90
[35, 4095742117, 1] 22
[36, 255169469, 1] 91
[36, 854250975, 1] 92
[36, 984453308, 1] 93
[36, 1430395551, 2] 94
[36, 2245384272, 2] 8
[36, 2246728737, 2] 9
[36, 3542456614, 2] 11
[36, 3921171501, 2] 95
[37, 670191639, 1] 96
[37, 864674487, 1] 79
[37, 864942730, 1] 97
[37, 894089450, 1] 98
[37, 922668176, 1] 99
[37, 984193073, 1] 100
[37, 1979701002, 1] 101
[37, 2235923055, 1] 102
[37, 2245384272, 1] 8
[37, 2246728737, 2] 9
[37, 3542456614, 1] 11
[37, 3627969527, 1] 103
[37, 3826145483, 1] 104
[37, 3994088662, 1] 83
[38, 264316921, 1] 105
[38, 773607102, 1] 84
[38, 787069595, 1] 106
[38, 847961216, 1] 85
[38, 894436332, 1] 86
[38, 1173125914, 1] 23
[38, 2245384272, 3] 8
[38, 2246728737, 2] 9
[38, 2264550215, 1] 87
[38, 2363984686, 1] 107
[38, 2648064603, 1] 108
[38, 2898623098, 1] 109
[38, 35424566