In [1]:
from rdkit import Chem, rdBase
from rdkit.Chem import Draw,rdDepictor
from rdkit.Chem import AllChem, Descriptors, DataStructs
from rdkit.Chem.Draw import IPythonConsole, rdMolDraw2D
from rdkit.Chem.Scaffolds import rdScaffoldNetwork
from datetime import datetime
# import math
# import matplotlib as mpl
# from matplotlib import pyplot as plt
# import os
# import pyvis
# from pyvis.network import Network
# import inspect
import psycopg2
from collections import defaultdict

# --- Database connection ---------------------------------------------------
# Open a session on the local "Chemistry" PostgreSQL database. The connection
# and its cursor are kept as notebook-level names (db_chem, cur).
db_chem = psycopg2.connect(
    host="localhost",
    dbname="Chemistry",
    user="postgres",
    password="postgres",
)
cur = db_chem.cursor()

# Ask the server for its version; fetchone() hands back the single result row.
cur.execute('SELECT version()')
db_ver = cur.fetchone()
print('PostgreSQL database version:', db_ver)

# --- Environment report ----------------------------------------------------
print('RDKit version: ', rdBase.rdkitVersion)
# The matplotlib / pyvis version reports stay disabled, like their imports:
# print('MatplotLib version:', mpl.__version__)
# print('Pyvis version:', pyvis.__version__)
print(datetime.now())

PostgreSQL database version: ('PostgreSQL 12.5 (Ubuntu 12.5-0ubuntu0.20.04.1) on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0, 64-bit',)
RDKit version:  2020.03.3
2021-01-14 21:56:45.281309


In [19]:
import collections
import fingerprinter as fpt

# Fetch every structure that has no row in "Structure_Fingerprint" yet.

db_chem = psycopg2.connect(
    host="localhost",
    dbname="Chemistry",
    user="postgres",
    password="postgres",
)
cur = db_chem.cursor()

# Unfiltered alternative, kept for reference:
#   'SELECT "Structure_ID", "SMILES" FROM public."Structures"'
sql = (
    'SELECT s."Structure_ID", s."SMILES" FROM public."Structures" s '
    'WHERE s."Structure_ID" IN (SELECT f."Structure_ID" FROM public."Structures" f '
    'WHERE NOT EXISTS (SELECT 1 FROM public."Structure_Fingerprint" fp WHERE f."Structure_ID" = fp."Structure_ID"))'
)
cur.execute(sql)

# Each fetched row is a (Structure_ID, SMILES) tuple.
mollist = cur.fetchall()

print('= = = Molecules without Fingerprints = = =')
print(len(mollist))
print(mollist)


# Generate Morgan fingerprints for the molecules selected above.
#
# Notebook-level results filled in by this loop:
#   fp_smi_list : {fingerprint bit id -> fragment SMILES}
#   fp_dict     : {fingerprint bit id -> [occurrence count, one entry per
#                  molecule that contains the bit]}
#   fp_allmol   : [[Structure_ID, fingerprint bit id, occurrence count], ...]

# Morgan radius; the SMILES rendering and the bit counting below must use the
# same value, otherwise their bit ids would not line up.
FP_RADIUS = 3

fp_key = {}
fp_val = []
fp_dict = collections.defaultdict(list)
fp_allmol = []
fp_smi_list = {}

for mol in mollist:
    print()
    print('Molecule : ',mol)
    m = Chem.MolFromSmiles(mol[1])
    if m is None:
        # MolFromSmiles returns None (it does not raise) when the SMILES cannot
        # be parsed. Report and skip the row so that a single bad structure
        # does not abort the whole batch.
        print('Skipped (invalid SMILES) : ',mol)
        continue

    # Fragment SMILES for every fingerprint bit of this molecule.
    fp_smi = fpt.FingerprintToSmiles(m, FP_RADIUS)
    print('Fingerprint : ',fp_smi)
    for f in fp_smi:
        fp_smi_list[f[0]] = f[1]

    # Fingerprint footprint: bitInfo maps each bit id to the environments that
    # set it, so len(v) is how many times the bit occurs in this molecule.
    big = {}
    fp = AllChem.GetMorganFingerprint(m, FP_RADIUS, bitInfo=big)
    for fpb,v in big.items():
        fp_dict[fpb].append(len(v))
        fp_allmol.append([mol[0], fpb, len(v)])

    # TODO: insert fingerprints for new molecules into structure_fingerprints

# ---- Report 1: one [Structure_ID, bit id, count] row per molecule/bit pair ----
print()
print("All Fingerprints by molecule")
print(fp_allmol)

# ---- Report 2: fragment SMILES recorded for each fingerprint bit ----
print()
print(fp_smi_list)
print(len(fp_smi_list),'SMILES')
for fpb in fp_smi_list:
    v = fp_smi_list[fpb]
    print(fpb,' - ',v)

# TODO: insert new SMILES into Fingerprint Table

# ---- Report 3: per-bit list of occurrence counts ----
print()
print(fp_dict)
print(len(fp_dict),'fingerprints')
for fpb in fp_dict:
    v = fp_dict[fpb]
    # bit id, number of molecules that contributed a count, the counts
    print(fpb,' - ',len(v),' - ',v)
        


= = = Molecules without Fingerprints = = =
15
[(10, 'CCO'), (11, 'CCN'), (13, 'CCC'), (14, 'CSC'), (15, 'CCSCC'), (16, 'CCCC'), (17, 'c1ccccc1'), (18, 'c1cnccc1'), (19, 'Cc1ccccc1'), (20, 'Cc1ccncc1'), (21, 'Cc1cccnc1'), (22, 'Cc1ccncc1Cl'), (23, 'Cc1c(N)cncc1Cl'), (24, 'CCOCC'), (34, 'CCNCC')]

Molecule :  (10, 'CCO')
Fingerprint :  [(864662311, 'O'), (1535166686, 'CO'), (2245384272, 'C'), (2246728737, 'C'), (3542456614, 'CC'), (4018048386, 'CCO')]

Molecule :  (11, 'CCN')
Fingerprint :  [(772817685, 'CCN'), (847957139, 'N'), (2245384272, 'C'), (2246728737, 'C'), (2592785365, 'CN'), (3542456614, 'CC')]

Molecule :  (13, 'CCC')
Fingerprint :  [(2068133184, 'CCC'), (2245384272, 'C'), (2246728737, 'C'), (3542456614, 'CC')]

Molecule :  (14, 'CSC')
Fingerprint :  [(1026928756, 'S'), (2246728737, 'C'), (2592252298, 'CS'), (4006898938, 'CSC')]

Molecule :  (15, 'CCSCC')
Fingerprint :  [(205323193, 'CSC'), (631714732, 'CCSC'), (1026928756, 'S'), (2245384272, 'C'), (2246728737, 'C'), (2547020

In [12]:
# This is part of netprepper
#    import pandas as pd
#    from scipy import stats as scistat

# import sys
# print(sys.path)
# Set PYTHONPATH to the directory that holds the private modules and packages (here: pkg_mod)
import netprepper

# Load the stored (Structure_ID, FP_ID, FP_Count) rows for type-1 fingerprints,
# then compute a Shannon value for every fingerprint currently held in fp_dict.

db_chem = psycopg2.connect(
    host="localhost",
    dbname="Chemistry",
    user="postgres",
    password="postgres",
)
cur = db_chem.cursor()
sql = (
    'SELECT sf."Structure_ID", sf."FP_ID", sf."FP_Count" FROM public."Structure_Fingerprint" sf '
    'JOIN public."Fingerprints" fp ON sf."FP_ID" = fp."Fingerprint_ID" WHERE fp."FP_Type" = 1'
)
cur.execute(sql)
# fp_list is only printed here; the entropy loop below works on fp_dict.
fp_list = cur.fetchall()
print(fp_list)

print(len(fp_dict),'fingerprints')

# Longest per-fingerprint count list (0 when fp_dict is empty).
# NOTE(review): this is the largest number of molecules sharing one bit, which
# can be smaller than the number of molecules processed - confirm that this is
# the size netprepper.shannon expects as its second argument.
n_fp = max((len(counts) for counts in fp_dict.values()), default=0)

for fpb, i in fp_dict.items():
    ns = netprepper.shannon(i,n_fp)
    # Elements 0 and 1 of the result are shown side by side.
    print(fpb,' - ',ns[0],' - ',ns[1])

# TODO: insert/update entropy table


[]
83 fingerprints
864662311  -  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.2711893730418441
1535166686  -  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.2711893730418441
2245384272  -  [1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0]  -  1.0579054247036734
2246728737  -  [1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2]  -  0.6901856760188042
3542456614  -  [1, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0]  -  1.0123308391031736
4018048386  -  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.2711893730418441
772817685  -  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.2711893730418441
847957139  -  [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.4293230219306162
2592785365  -  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.2711893730418441
2068133184  -  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.2711893730418441
1026928756  -  [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.4293230219306162
2592252298  -  [2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  -  0.2711893730418441
4006898938  -  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [6]:
# Generate Fingerprint SMILES

from rdkit import Chem
from rdkit.Chem import AllChem

def FingerprintToSmiles(m, s):
    """Render each Morgan fingerprint bit of a molecule as a fragment SMILES.

    Args:
        m: RDKit molecule to fingerprint.
        s: Morgan radius handed to ``AllChem.GetMorganFingerprint``.

    Returns:
        A list of ``(bit_id, smiles)`` tuples, one per bit reported in the
        fingerprint's ``bitInfo``. Only the first ``(atom, radius)`` environment
        recorded for a bit is rendered. For radius 0 the SMILES is the atom
        symbol, lower-cased when the atom is aromatic; for larger radii it is
        the SMILES of the sub-molecule spanned by that atom environment.
    """
    bit_info = {}
    # Called for its side effect only: it fills bit_info.
    AllChem.GetMorganFingerprint(m, s, bitInfo=bit_info)

    pairs = []
    for bit_id, hits in bit_info.items():
        first_hit = hits[0]
        center_idx = first_hit[0]
        radius = first_hit[1]

        if radius > 0:
            # Bonds within `radius` of the centre atom -> sub-molecule -> SMILES.
            bond_ids = Chem.FindAtomEnvironmentOfRadiusN(m, radius, center_idx)
            atom_map = {}
            fragment = Chem.PathToSubmol(m, bond_ids, atomMap=atom_map)
            smiles = Chem.MolToSmiles(fragment)
        else:
            # A radius-0 environment is just the centre atom itself.
            atom = m.GetAtomWithIdx(center_idx)
            smiles = atom.GetSymbol()
            if atom.GetIsAromatic():
                smiles = smiles.lower()

        pairs.append((bit_id, smiles))

    return pairs

In [None]:
# Insert novel structures after structure check.
#
# Every SMILES is inserted in its own transaction:
#   * success            -> commit, so the row is actually persisted
#   * integrity failure  -> rollback and report the structure as existing
#   * other DB error     -> rollback and report the error, then carry on
# psycopg2 opens a transaction implicitly on the first execute(); without an
# explicit commit() the INSERT is discarded when the connection is closed.

smi_in = ("CCOCC", "CCNCC")

db_chem = psycopg2.connect(host = "localhost", dbname="Chemistry", user="postgres", password="postgres")
cur = db_chem.cursor()
sql = 'INSERT INTO public."Structures"("SMILES") VALUES(%s);'

for i in smi_in:
    smi = (i,)
    try:
        cur.execute(sql, smi)
    except psycopg2.IntegrityError:
        # Constraint violation, treated as a duplicate SMILES. rollback()
        # clears the aborted transaction, so the same connection and cursor
        # remain usable for the next SMILES (no reconnect needed).
        db_chem.rollback()
        print("Structure Exist: ", i)
    except psycopg2.Error as err:
        # Any other database error is reported as what it is instead of being
        # mislabelled as a duplicate.
        db_chem.rollback()
        print("Insert failed: ", i, '-', err)
    else:
        db_chem.commit()
        print('Inserted:', i)




In [1]:
import chemistry_psql as chemps

# SMILES strings handed to the insert helper.
smi_in = ("CCOCC", "CCNCC")

# Delegate the insert to chemistry_psql.SmilesInsert. Its implementation is not
# part of this notebook; presumably it mirrors the inline insert loop from the
# earlier cell - TODO confirm.
chemps.SmilesInsert(smi_in)

Structure Exist:  CCOCC
Inserted: CCNCC


In [14]:
# Insert new fingerprints into fingerprint table.
#
# Reads all type-1 fingerprints already stored, then inserts every entry of
# fp_smi_list ({bit id -> fragment SMILES}) whose bit id is not stored yet.

db_chem = psycopg2.connect(host = "localhost", dbname="Chemistry", user="postgres", password="postgres")
cur = db_chem.cursor()
sql = 'SELECT * FROM public."Fingerprints" WHERE "FP_Type" = 1'
cur.execute(sql)
fpdblist = cur.fetchall()
print(len(fpdblist), "existing fingerprints")
print(fpdblist)

print()
print("New inserted fingerprints")

sql = 'INSERT INTO public."Fingerprints"("FP_Smiles", "FP_Number", "FP_Type") VALUES(%s, %s, 1);'

# Column 3 of a fetched row is compared against the bit id (FP_Number).
# Collect those values once so each membership test is a set lookup instead of
# a scan over every existing row.
known_fp_numbers = {row[3] for row in fpdblist}

for fpb,v in fp_smi_list.items():
    # 1 = already stored, 0 = new (the flag is echoed in the printed line).
    smi_fl = 1 if fpb in known_fp_numbers else 0

    if smi_fl == 0:
        cur.execute(sql, (v, fpb))
        # Commit per row so earlier inserts survive a later failure.
        db_chem.commit()
        print(fpb,' - ',v, smi_fl, "inserted")


83 existing fingerprints
[(5, 'CCN', None, 772817685, 1), (7, 'CO', None, 1535166686, 1), (8, 'C', None, 2245384272, 1), (9, 'C', None, 2246728737, 1), (10, 'O', None, 864662311, 1), (11, 'CC', None, 3542456614, 1), (12, 'CCO', None, 4018048386, 1), (13, 'N', None, 847957139, 1), (14, 'CN', None, 2592785365, 1), (15, 'CCC', None, 2068133184, 1), (16, 'S', None, 1026928756, 1), (17, 'CS', None, 2592252298, 1), (18, 'CSC', None, 4006898938, 1), (19, 'CSC', None, 205323193, 1), (20, 'CCSC', None, 631714732, 1), (21, 'CCSCC', None, 2547020769, 1), (22, 'CCS', None, 4095742117, 1), (23, 'CCC', None, 1173125914, 1), (24, 'CCCC', None, 1244535424, 1), (25, 'ccc', None, 98513984, 1), (26, 'ccccc', None, 2763854213, 1), (27, 'c', None, 3218693969, 1), (28, 'c1ccccc1', None, 3741631696, 1), (29, 'c1ccncc1', None, 755035130, 1), (30, 'ccccn', None, 1207774339, 1), (31, 'ccncc', None, 1343371647, 1), (32, 'cccnc', None, 1821698485, 1), (33, 'n', None, 2041434490, 1), (34, 'cnc', None, 3118255683, 

In [33]:
# Look up the database id of every fingerprint referenced in fp_allmol.

db_chem = psycopg2.connect(
    host="localhost",
    dbname="Chemistry",
    user="postgres",
    password="postgres",
)
cur = db_chem.cursor()
sql = 'SELECT * FROM public."Fingerprints" WHERE "FP_Type" = 1'
cur.execute(sql)
fpdblist = cur.fetchall()

# Map column 3 of each row (the fingerprint number) to column 0 (the row id).
# A later row with the same number overrides an earlier one.
fpid = {row[3]: row[0] for row in fpdblist}

print(fpid)

# fp_allmol rows are [Structure_ID, bit id, count]; m[1] is the bit id.
# A bit id that is missing from the table raises KeyError here.
for m in fp_allmol:
    i = fpid[m[1]]
    print(m,i)

{772817685: 5, 1535166686: 7, 2245384272: 8, 2246728737: 9, 864662311: 10, 3542456614: 11, 4018048386: 12, 847957139: 13, 2592785365: 14, 2068133184: 15, 1026928756: 16, 2592252298: 17, 4006898938: 18, 205323193: 19, 631714732: 20, 2547020769: 21, 4095742117: 22, 1173125914: 23, 1244535424: 24, 98513984: 25, 2763854213: 26, 3218693969: 27, 3741631696: 28, 755035130: 29, 1207774339: 30, 1343371647: 31, 1821698485: 32, 2041434490: 33, 3118255683: 34, 3776905034: 35, 297085749: 36, 422715066: 37, 742000539: 38, 908339072: 39, 951226070: 40, 3207567135: 41, 3217380708: 42, 3999906991: 43, 4244175903: 44, 225202425: 45, 614176945: 46, 1618426808: 47, 1717044408: 48, 72698074: 49, 1122673650: 50, 1153219603: 51, 1194681160: 52, 1349404210: 53, 2663076441: 54, 3777168895: 55, 281436469: 56, 1016841875: 57, 1237992904: 58, 1524398717: 59, 1888009671: 60, 2424973678: 61, 3124581743: 62, 3264841329: 63, 3452535345: 64, 3765881064: 65, 3886790172: 66, 265830265: 67, 745511444: 68, 1083852209: 69,