In [1]:
import os
import pandas as pd
import numpy as np
import scipy as sc
import random
import pickle 

import sklearn
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# suppress all warnings
# NOTE(review): a blanket 'ignore' filter silences every warning category for all
# later cells, including pandas deprecation / copy warnings — confirm this is intended.
import warnings
warnings.filterwarnings('ignore') 

# Fix the seed of Python's `random` module and of NumPy's global generator.
SEED = 42
random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so assigning it
# here presumably only affects child processes, not this running kernel — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)

In [3]:
# path to read: directory holding the CSV tables loaded by this notebook
path_read = '../Flask/psql_db/Tables'
# path to save: directory where the generated HTML figures are written
path_save = '../Flask/pollutants_db_app/static/img'

#### FP bits space

In [33]:
# Read the fingerprints table and report its row index and column labels.
fingerprints_csv = f"{path_read}/FingerprintsTable.csv"
fingerprint_data = pd.read_csv(fingerprints_csv)
print(fingerprint_data.index, "\n", fingerprint_data.columns)

Int64Index([  8680, 143127,  10870,  69848,  12993, 164514,  86240,  13129,
             67884,   6278,
            ...
             26042,  24193,  23963,  23964,  23989,  23990,  23992,  23993,
             23994,  23995],
           dtype='int64', length=1826) 
 Index(['FP1', 'FP2', 'FP3', 'FP4', 'FP5', 'FP6', 'FP7', 'FP8', 'FP9', 'FP10',
       ...
       'APC2D10_I_I', 'APC2D10_I_B', 'APC2D10_I_Si', 'APC2D10_I_X',
       'APC2D10_B_B', 'APC2D10_B_Si', 'APC2D10_B_X', 'APC2D10_Si_Si',
       'APC2D10_Si_X', 'APC2D10_X_X'],
      dtype='object', length=16092)


In [3]:
# read FPB object (preprocessed fingerprint-bits data stored as a pickle)
# NOTE: pickle.load can execute arbitrary code while unpickling; only load files
# that were produced by a trusted source.
filename = '../Preprocessed_data/fp_bits.obj'
# the context manager closes the file handle as soon as the object is loaded
with open(filename, 'rb') as filehandler:
    Fpb_obj = pickle.load(filehandler)
Fpb_obj

{'annotation': {'samples': Int64Index([   8680,  143127,   10870,   69848,   12993,  164514,   86240,
                13129,   67884,    6278,
              ...
                 1117,    1119,   24682, 4685067,   14778,   24841,   14781,
                11430,   26042,   24193],
             dtype='int64', length=1750),
  'features': Index(['FP1', 'FP2', 'FP3', 'FP4', 'FP5', 'FP6', 'FP7', 'FP8', 'FP9', 'FP10',
         ...
         'AD2D714', 'AD2D715', 'AD2D726', 'AD2D729', 'AD2D735', 'AD2D760',
         'AD2D765', 'AD2D766', 'AD2D770', 'AD2D780'],
        dtype='object', length=5218)},
 'tfidf': {'tfidf_transformer': TfidfTransformer(),
  'tfidf_data': <1750x5218 sparse matrix of type '<class 'numpy.float64'>'
  	with 467820 stored elements in Compressed Sparse Row format>},
 'lsa': {'lsa_data': array([[ 4.46150378e-01,  4.65599796e-01, -2.30471042e-01, ...,
           1.78108040e-02, -4.14571861e-03,  3.34084720e-02],
         [ 7.61418679e-01, -2.97931919e-01, -2.92913529e-01, ...,

In [4]:
# Keep only the fingerprint-bit columns listed in the preprocessed object.
fpb_features = Fpb_obj['annotation']['features']
fingerprint_bits = fingerprint_data.loc[:, fpb_features]
print(fingerprint_bits.columns)

Index(['FP1', 'FP2', 'FP3', 'FP4', 'FP5', 'FP6', 'FP7', 'FP8', 'FP9', 'FP10',
       ...
       'AD2D714', 'AD2D715', 'AD2D726', 'AD2D729', 'AD2D735', 'AD2D760',
       'AD2D765', 'AD2D766', 'AD2D770', 'AD2D780'],
      dtype='object', length=5218)


In [5]:
# IDF weights of the fingerprint bits, taken from the stored TF-IDF transformer
# and labelled with the corresponding bit names.
fpb_tfidf = Fpb_obj['tfidf']['tfidf_transformer']
df_idf_fpb = pd.DataFrame(
    data=fpb_tfidf.idf_,
    index=fingerprint_bits.columns,
    columns=["idf_weights"],
)
# display the weights in ascending order
df_idf_fpb.sort_values(by='idf_weights')

Unnamed: 0,idf_weights
GraphFP742,1.009756
FP742,1.009756
FP743,1.012066
GraphFP743,1.012066
SubFP307,1.013222
...,...
KRFP1449,7.774795
KRFP2870,7.774795
KRFP2877,7.774795
KRFP4498,7.774795


##### FP bits classes:

| Fingerprint | Class | Number | Code |
| --- | --- | --- | --- |
| CDK fingerprint | Fingerprinter | 1024 | FP |
| CDK extended fingerprint | ExtendedFingerprinter | 1024 | ExtFP |
| CDK graph only fingerprint | GraphOnlyFingerprinter | 1024 | GraphFP |
| Estate fingerprint | EStateFingerprinter | 79 | EStateFP |
| MACCS fingerprint | MACCSFingerprinter | 166 | MACCSFP |
| Pubchem fingerprint | PubchemFingerprinter | 881 | PubchemFP |
| Substructure fingerprint | SubstructureFingerprinter | 307 | SubFP |
| Klekota-Roth fingerprint | KlekotaRothFingerprinter | 4860 | KRFP |
| 2D atom pairs | AtomPairs2DFingerprinter | 780 | AD2D |

In [6]:
# Name prefixes of the fingerprint-bit classes; they are matched against the
# fingerprint names to group the IDF weights by class.
fpb_classes = ['FP', 'ExtFP', 'GraphFP', 'EStateFP', 'MACCSFP', 'PubchemFP', 'SubFP', 'KRFP', 'AD2D']

In [36]:
# get idf weights of fingerprints bits classes
# Build one frame per class and concatenate once at the end, instead of growing a
# DataFrame with pd.concat inside the loop.
fpb_class_frames = []
for i in fpb_classes:
    # rows whose fingerprint name starts with the class prefix; explicit copy so
    # the column inserts below never touch a slice of df_idf_fpb
    df = df_idf_fpb.loc[df_idf_fpb.index.str.startswith(i)].copy()
    # resulting column order: Fingerprint, idf_weights, Class
    df.insert(0, 'Fingerprint', df.index)
    df.insert(2, 'Class', i)
    # sort weights (ascending) within the class
    df = df.sort_values(by=['idf_weights'])
    fpb_class_frames.append(df)
# pd.concat raises on an empty list, so fall back to an empty frame in that case
fpb_classes_df = pd.concat(fpb_class_frames) if fpb_class_frames else pd.DataFrame()
fpb_classes_df

Unnamed: 0,Fingerprint,idf_weights,Class
FP742,FP742,1.009756,FP
FP743,FP743,1.012066,FP
FP972,FP972,1.035459,FP
FP638,FP638,1.143452,FP
FP921,FP921,1.159400,FP
...,...,...,...
AD2D458,AD2D458,7.774795,AD2D
AD2D454,AD2D454,7.774795,AD2D
AD2D261,AD2D261,7.774795,AD2D
AD2D500,AD2D500,7.774795,AD2D


In [75]:
# Polar line chart of the fingerprint-bit IDF weights, one trace per class.

# mode-bar configuration shared by the saved HTML file and the inline display
config = {'modeBarButtonsToRemove': ['zoom'], 'displaylogo': False}

fig = px.line_polar(
    fpb_classes_df,
    r='idf_weights',
    theta='Fingerprint',
    color='Class',
    line_close=False,
    color_discrete_sequence=px.colors.sequential.Plasma_r,
    title="Fingerprints Bits - Weights Distribution (IDF)",
    width=450,
    height=400,
)
# hide the angular tick labels
fig.update_polars(angularaxis_showticklabels=False)
fig.update_layout(title_x=0.5, title_font_size=14, font_size=12)

# save figure, then display it
bits_idf_html = path_save + '/fingerprints_bits_idf.html'
fig.write_html(bits_idf_html, config=config)
fig.show(config=config)

#### FP counts space

In [18]:
# read FPC object (preprocessed fingerprint-counts data stored as a pickle)
# NOTE: pickle.load can execute arbitrary code while unpickling; only load files
# that were produced by a trusted source.
filename = '../Preprocessed_data/fp_counts.obj'
# the context manager closes the file handle as soon as the object is loaded
with open(filename, 'rb') as filehandler:
    Fpc_obj = pickle.load(filehandler)
Fpc_obj

{'annotation': {'samples': Int64Index([   8680,  143127,   10870,   69848,   12993,  164514,   86240,
                13129,   67884,    6278,
              ...
                 1117,    1119,   24682, 4685067,   14778,   24841,   14781,
                11430,   26042,   24193],
             dtype='int64', length=1750),
  'features': Index(['SubFPC1', 'SubFPC2', 'SubFPC3', 'SubFPC4', 'SubFPC5', 'SubFPC6',
         'SubFPC7', 'SubFPC8', 'SubFPC9', 'SubFPC10',
         ...
         'APC2D10_C_X', 'APC2D10_N_N', 'APC2D10_O_O', 'APC2D10_O_F',
         'APC2D10_O_X', 'APC2D10_Cl_Cl', 'APC2D10_Cl_X', 'APC2D10_Br_Br',
         'APC2D10_Br_X', 'APC2D10_X_X'],
        dtype='object', length=1515)},
 'tfidf': {'tfidf_transformer': TfidfTransformer(),
  'tfidf_data': <1750x1515 sparse matrix of type '<class 'numpy.float64'>'
  	with 76585 stored elements in Compressed Sparse Row format>},
 'lsa': {'lsa_data': array([[ 8.12594735e-01,  3.60701844e-01, -1.29693415e-02, ...,
          -9.78689333e-0

In [19]:
# get fingerprints counts by subsetting preprocessed columns 
fingerprint_counts = fingerprint_data.loc[:, Fpc_obj['annotation']['features']]
fingerprint_counts.columns

Index(['SubFPC1', 'SubFPC2', 'SubFPC3', 'SubFPC4', 'SubFPC5', 'SubFPC6',
       'SubFPC7', 'SubFPC8', 'SubFPC9', 'SubFPC10',
       ...
       'APC2D10_C_X', 'APC2D10_N_N', 'APC2D10_O_O', 'APC2D10_O_F',
       'APC2D10_O_X', 'APC2D10_Cl_Cl', 'APC2D10_Cl_X', 'APC2D10_Br_Br',
       'APC2D10_Br_X', 'APC2D10_X_X'],
      dtype='object', length=1515)

In [20]:
# print idf values of fingeprints counts
df_idf_fpc = pd.DataFrame(Fpc_obj['tfidf']['tfidf_transformer'].idf_, index=fingerprint_counts.columns, columns=["idf_weights"]) 
# sort ascending 
df_idf_fpc.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
SubFPC307,1.013222
APC2D1_C_C,1.055782
APC2D2_C_C,1.071607
KRFPC2975,1.111662
SubFPC300,1.121287
...,...
KRFPC4258,7.774795
KRFPC4257,7.774795
KRFPC1759,7.774795
KRFPC4328,7.774795


##### FP counts classes:

| Fingerprint | Class | Number | Code |
| --- | --- | --- | --- |
| Substructure fingerprint count | SubstructureFingerprintCount | 307 | SubFPC |
| Klekota-Roth fingerprint count | KlekotaRothFingerprintCount | 4860 | KRFPC |
| 2D atom pairs count | AtomPairs2DFingerprintCount | 780 | APC2D |

In [21]:
# Name prefixes of the fingerprint-count classes; they are matched against the
# feature names to group the IDF weights by class.
fpc_classes = ['SubFPC', 'KRFPC', 'APC2D']

In [37]:
# get idf weights of fingerprints counts classes
# Build one frame per class and concatenate once at the end, instead of growing a
# DataFrame with pd.concat inside the loop.
fpc_class_frames = []
for i in fpc_classes:
    # rows whose feature name starts with the class prefix; explicit copy so the
    # column inserts below never touch a slice of df_idf_fpc
    df = df_idf_fpc.loc[df_idf_fpc.index.str.startswith(i)].copy()
    # resulting column order: Fingerprint, idf_weights, Class
    df.insert(0, 'Fingerprint', df.index)
    df.insert(2, 'Class', i)
    # sort weights (ascending) within the class
    df = df.sort_values(by=['idf_weights'])
    fpc_class_frames.append(df)
# pd.concat raises on an empty list, so fall back to an empty frame in that case
fpc_classes_df = pd.concat(fpc_class_frames) if fpc_class_frames else pd.DataFrame()
fpc_classes_df

Unnamed: 0,Fingerprint,idf_weights,Class
SubFPC307,SubFPC307,1.013222,SubFPC
SubFPC301,SubFPC301,1.121287,SubFPC
SubFPC300,SubFPC300,1.121287,SubFPC
SubFPC302,SubFPC302,1.333051,SubFPC
SubFPC1,SubFPC1,1.400622,SubFPC
...,...,...,...
APC2D6_Br_X,APC2D6_Br_X,7.774795,APC2D
APC2D6_Br_Br,APC2D6_Br_Br,7.774795,APC2D
APC2D4_O_F,APC2D4_O_F,7.774795,APC2D
APC2D7_O_Si,APC2D7_O_Si,7.774795,APC2D


In [79]:
# Polar line chart of the fingerprint-count IDF weights, one trace per class.

# mode-bar configuration shared by the saved HTML file and the inline display
config = {'modeBarButtonsToRemove': ['zoom'], 'displaylogo': False}

fig = px.line_polar(
    fpc_classes_df,
    r='idf_weights',
    theta='Fingerprint',
    color='Class',
    line_close=False,
    color_discrete_sequence=px.colors.sequential.Plasma_r,
    title="Fingerprints Counts - Weights Distribution (IDF)",
    width=425,
    height=400,
)
# hide the angular tick labels
fig.update_polars(angularaxis_showticklabels=False)
fig.update_layout(title_x=0.5, title_font_size=14, font_size=12)

# save figure, then display it
counts_idf_html = path_save + '/fingerprints_counts_idf.html'
fig.write_html(counts_idf_html, config=config)
fig.show(config=config)

#### Molecular descriptors

In [4]:
# Read the molecular descriptors table, cast every column to float64, and report
# its row index and column labels.
molecular_descriptors = pd.read_csv(
    f"{path_read}/MolecularDescriptorsTable.csv",
    low_memory=False,
).astype('float64')
print(molecular_descriptors.index, "\n", molecular_descriptors.columns, "\n")

Int64Index([  8680, 143127,  10870,  69848,  12993, 164514,  86240,  13129,
             67884,   6278,
            ...
             26042,  24193,  23963,  23964,  23989,  23990,  23992,  23993,
             23994,  23995],
           dtype='int64', length=1826) 
 Index(['nAcid', 'ALogP', 'ALogp2', 'AMR', 'apol', 'naAromAtom', 'nAromBond',
       'nAtom', 'nHeavyAtom', 'nH',
       ...
       'P1s', 'P2s', 'E1s', 'E2s', 'E3s', 'Ts', 'As', 'Vs', 'Ks', 'Ds'],
      dtype='object', length=1875) 



In [5]:
# read MD object (preprocessed molecular-descriptors data stored as a pickle)
# NOTE: pickle.load can execute arbitrary code while unpickling; only load files
# that were produced by a trusted source.
filename = '../Preprocessed_data/mol_desc.obj'
# the context manager closes the file handle as soon as the object is loaded
with open(filename, 'rb') as filehandler:
    Md_obj = pickle.load(filehandler)
Md_obj

{'annotation': {'samples': Int64Index([   8680,  143127,   10870,   69848,   12993,  164514,   86240,
                13129,   67884,    6278,
              ...
                 1117,    1119,   24682, 4685067,   14778,   24841,   14781,
                11430,   26042,   24193],
             dtype='int64', length=1750),
  'features': Index(['nAcid', 'ALogP', 'ALogp2', 'AMR', 'apol', 'naAromAtom', 'nAromBond',
         'nAtom', 'nHeavyAtom', 'nH',
         ...
         'P1s', 'P2s', 'E1s', 'E2s', 'E3s', 'Ts', 'As', 'Vs', 'Ks', 'Ds'],
        dtype='object', length=1675)},
 'pca': {'pca_data': array([[ -7.05043105,   4.41111069,  -8.69597704, ...,   1.06360682,
            1.51658494,   0.22010407],
         [  0.58941326,  -2.41669609, -11.55908192, ...,  -0.17072875,
           -0.07989102,  -0.16213433],
         [ -6.67228085,   3.56642181,  -8.64308698, ...,   0.13109836,
            1.18292608,   0.89302626],
         ...,
         [ -9.13388871,  -6.88906247,  15.71301314, ...,   

In [6]:
# Restrict the descriptors table to the samples (rows) and features (columns)
# recorded in the preprocessed object, in a single label-based selection.
md_samples = Md_obj['annotation']['samples']
md_features = Md_obj['annotation']['features']
molecular_desc = molecular_descriptors.loc[md_samples, md_features]
molecular_desc.columns

Index(['nAcid', 'ALogP', 'ALogp2', 'AMR', 'apol', 'naAromAtom', 'nAromBond',
       'nAtom', 'nHeavyAtom', 'nH',
       ...
       'P1s', 'P2s', 'E1s', 'E2s', 'E3s', 'Ts', 'As', 'Vs', 'Ks', 'Ds'],
      dtype='object', length=1675)

In [7]:
# Read the molecules table (latin1-encoded CSV) and preview the first rows.
molecules_csv = f"{path_read}/MoleculesTable.csv"
molecules = pd.read_csv(molecules_csv, encoding='latin1')
molecules.head()

Unnamed: 0,Chemical_Name,CID,CAS,InChIKey,Canonical_SMILES,Molecular_Formula,Atoms,Molecule_Type
0,(1-methylpropyl)benzene,8680,135-98-8,ZJMWRROPUADPEA-UHFFFAOYSA-N,CCC(C)C1=CC=CC=C1,C10H14,[C],organic
1,(2-methylbutyl)cyclohexane,143127,54105-77-0,DDQXBDLAGHZBMP-UHFFFAOYSA-N,CCC(C)CC1CCCCC1,C11H22,[C],organic
2,(2-methylpropyl)benzene,10870,538-93-2,KXUHSQYYJYAXGZ-UHFFFAOYSA-N,CC(C)CC1=CC=CC=C1,C10H14,[C],organic
3,"1,1,1-trichloroethane",6278,71-55-6,UOCLXMDMGBRAIB-UHFFFAOYSA-N,CC(Cl)(Cl)Cl,C2H3Cl3,"[C, Cl]",organic
4,"1,1,2,2-tetrachloroethane",6591,79-34-5,QPFMBZIOSGYJDE-UHFFFAOYSA-N,C(C(Cl)Cl)(Cl)Cl,C2H2Cl4,"[C, Cl]",organic


In [8]:
# take CID, Molecular Formula and Molecule Type to represent molecules in PCA plot
# Rows and columns are selected in one .loc call and copied explicitly, so the
# column assignment below never targets a temporary slice of `molecules`.
in_molecular_desc = molecules.CID.isin(molecular_desc.index)
col = molecules.loc[in_molecular_desc, ['CID', 'Molecular_Formula', 'Molecule_Type']].copy()
# an ordered categorical with molecular_desc.index as categories makes the sort
# below arrange the rows in the same order as molecular_desc
col['CID'] = pd.Categorical(col.CID, ordered=True, categories=molecular_desc.index)
# reassign instead of sorting in place
col = col.sort_values('CID')
col

Unnamed: 0,CID,Molecular_Formula,Molecule_Type
0,8680,C10H14,organic
1,143127,C11H22,organic
2,10870,C10H14,organic
1641,69848,C10H12,organic
1651,12993,C5H8O,organic
...,...,...,...
1024,24841,HI,inorganic
921,14781,Ca5HO13P3,inorganic
1476,11430,CK2O3,inorganic
1481,26042,O2Ti,inorganic


In [9]:
# Hover text: one ('CID: …', ' Formula: …') pair per row of `col`
# (rows are CID, Molecular_Formula, Molecule_Type; the last field is unused here).
cid_formula = [
    ('CID: ' + str(cid), " Formula: " + formula)
    for cid, formula, _mol_type in col.values
]
cid_formula[0:10]

[('CID: 8680', ' Formula: C10H14'),
 ('CID: 143127', ' Formula: C11H22'),
 ('CID: 10870', ' Formula: C10H14'),
 ('CID: 69848', ' Formula: C10H12'),
 ('CID: 12993', ' Formula: C5H8O'),
 ('CID: 164514', ' Formula: C5H3F9O'),
 ('CID: 86240', ' Formula: C5H2F10'),
 ('CID: 13129', ' Formula: C2H2F4'),
 ('CID: 67884', ' Formula: C4H5F5'),
 ('CID: 6278', ' Formula: C2H3Cl3')]

In [47]:
# Scatter matrix of the first 5 PCA components of the molecular descriptors,
# coloured by molecule type.
n_components = 5
# set color
color = col.Molecule_Type
# axis labels "PC 1" … "PC 5" plus the legend title
labels = {str(pc): f"PC {pc+1}" for pc in range(n_components)}
labels['color'] = 'Molecule type'

# configuration of the mode bar, used for both the saved file and the display
config = {
    'modeBarButtonsToRemove': ['logo', 'zoom', 'pan', 'select', 'lasso', 'autoscale'],
    'displaylogo': False,
}

fig = px.scatter_matrix(
    Md_obj['pca']['pca_data'],
    dimensions=range(n_components),
    color=color,
    labels=labels,
    hover_name=cid_formula,
    title="Molecular Descriptors - Explained Variance (PCA)",
    width=480,
    height=430,
)

# remove diagonal
fig.update_traces(diagonal_visible=False)

# remove x and y tick labels on every axis of the matrix
hidden_tick_labels = {
    f"{axis}axis{k+1}": dict(showticklabels=False)
    for axis in ("x", "y")
    for k in range(n_components)
}
fig.update_layout(hidden_tick_labels)
# fonts, centred title and a horizontal legend below the plot
fig.update_layout(
    font_size=12,
    title_x=0.5,
    title_font_size=14,
    legend=dict(orientation="h", y=-0.2),
)

# save figure, then display it
md_pca_html = path_save + '/molecular_descriptors_PCA.html'
fig.write_html(md_pca_html, config=config)
fig.show(config=config)

#### Quantum properties

In [48]:
# Read the quantum properties table and index it by the 'Molecule_CID' column.
quantum_properties = pd.read_csv(
    f"{path_read}/QuantumPropertiesTable.csv"
).set_index('Molecule_CID')
quantum_properties.index

Int64Index([528971,  13387,   7304,   7903, 181154,   8247,   5897,  79123,
              7245, 182210,
            ...
             26042,  24193,  23963,  23964,  23989,  23990,  23992,  23993,
             23994,  23995],
           dtype='int64', name='Molecule_CID', length=1826)

In [49]:
# read QP object (preprocessed quantum-properties data stored as a pickle)
# NOTE: pickle.load can execute arbitrary code while unpickling; only load files
# that were produced by a trusted source.
filename = '../Preprocessed_data/quantum_props.obj'
# the context manager closes the file handle as soon as the object is loaded
with open(filename, 'rb') as filehandler:
    Qp_obj = pickle.load(filehandler)
Qp_obj

{'annotation': {'samples': Int64Index([ 528971,   13387,    7304,    7903,  181154,    8247,    5897,
                79123,    7245,  182210,
              ...
                24682, 4685067,   14781,   14778,   25477,   24841,    6328,
                11430,   26042,   24193],
             dtype='int64', name='Molecule_CID', length=1750),
  'features': Index(['Gas_Phase_Energy', 'Final_Energy', 'HOMO', 'LUMO', 'Polarizability',
         'QM_Dipole', 'QM_Dipole_X', 'QM_Dipole_Y', 'QM_Dipole_Z',
         'Lowest_Frequency', 'Highest_Frequency', 'Second_Lowest_Frequency',
         'Num_Negative_Frequencies', 'Zero_Point_Energy', 'Entropy', 'Enthalpy',
         'Free_Energy', 'Internal_Energy', 'Heat_Capacity', 'ln_Q',
         'Total_Internal_Energy', 'Total_Enthalpy', 'Total_Free_Energy'],
        dtype='object')},
 'pca': {'pca_data': array([[ 6.83221008, -2.03569661, -2.53132431, ..., -0.50173662,
           0.04728384,  0.28558018],
         [-2.44563902,  0.07856449,  0.18794303, .

In [50]:
# Restrict the quantum properties table to the samples (rows) and features
# (columns) recorded in the preprocessed object, in a single label-based selection.
qp_samples = Qp_obj['annotation']['samples']
qp_features = Qp_obj['annotation']['features']
quantum_props = quantum_properties.loc[qp_samples, qp_features]
quantum_props.columns

Index(['Gas_Phase_Energy', 'Final_Energy', 'HOMO', 'LUMO', 'Polarizability',
       'QM_Dipole', 'QM_Dipole_X', 'QM_Dipole_Y', 'QM_Dipole_Z',
       'Lowest_Frequency', 'Highest_Frequency', 'Second_Lowest_Frequency',
       'Num_Negative_Frequencies', 'Zero_Point_Energy', 'Entropy', 'Enthalpy',
       'Free_Energy', 'Internal_Energy', 'Heat_Capacity', 'ln_Q',
       'Total_Internal_Energy', 'Total_Enthalpy', 'Total_Free_Energy'],
      dtype='object')

In [51]:
# take CID, Molecular Formula and Molecule Type to represent molecules in PCA plot
# Rows and columns are selected in one .loc call and copied explicitly, so the
# column assignment below never targets a temporary slice of `molecules`.
in_quantum_props = molecules.CID.isin(quantum_props.index)
col = molecules.loc[in_quantum_props, ['CID', 'Molecular_Formula', 'Molecule_Type']].copy()
# an ordered categorical with quantum_props.index as categories makes the sort
# below arrange the rows in the same order as quantum_props
col['CID'] = pd.Categorical(col.CID, ordered=True, categories=quantum_props.index)
# reassign instead of sorting in place
col = col.sort_values('CID')
col

Unnamed: 0,CID,Molecular_Formula,Molecule_Type
1201,528971,C27H54,organic
76,13387,C5H9NO,organic
1623,7304,C5H7N,organic
1031,7903,C3H7NO2,organic
1197,181154,C23H46,organic
...,...,...,...
1024,24841,HI,inorganic
1321,6328,CH3I,organic
1476,11430,CK2O3,inorganic
1481,26042,O2Ti,inorganic


In [52]:
# Hover text: one ('CID: …', ' Formula: …') pair per row of `col`
# (rows are CID, Molecular_Formula, Molecule_Type; the last field is unused here).
cid_formula = [
    ('CID: ' + str(cid), " Formula: " + formula)
    for cid, formula, _mol_type in col.values
]
cid_formula[0:10]

[('CID: 528971', ' Formula: C27H54'),
 ('CID: 13387', ' Formula: C5H9NO'),
 ('CID: 7304', ' Formula: C5H7N'),
 ('CID: 7903', ' Formula: C3H7NO2'),
 ('CID: 181154', ' Formula: C23H46'),
 ('CID: 8247', ' Formula: C8H10N2O'),
 ('CID: 5897', ' Formula: C15H13NO'),
 ('CID: 79123', ' Formula: C4H9NO3'),
 ('CID: 7245', ' Formula: C6H5ClO'),
 ('CID: 182210', ' Formula: C20H24O5')]

In [53]:
# Scatter matrix of the first 5 PCA components of the quantum properties,
# coloured by molecule type.
n_components = 5
# set color
color = col.Molecule_Type
# axis labels "PC 1" … "PC 5" plus the legend title
labels = {str(pc): f"PC {pc+1}" for pc in range(n_components)}
labels['color'] = 'Molecule type'

# configuration of the mode bar, used for both the saved file and the display
config = {
    'modeBarButtonsToRemove': ['logo', 'zoom', 'pan', 'select', 'lasso', 'autoscale'],
    'displaylogo': False,
}

fig = px.scatter_matrix(
    Qp_obj['pca']['pca_data'],
    dimensions=range(n_components),
    color=color,
    labels=labels,
    hover_name=cid_formula,
    title="Quantum Properties - Explained Variance (PCA)",
    width=480,
    height=430,
)

# remove diagonal
fig.update_traces(diagonal_visible=False)

# remove x and y tick labels on every axis of the matrix
hidden_tick_labels = {
    f"{axis}axis{k+1}": dict(showticklabels=False)
    for axis in ("x", "y")
    for k in range(n_components)
}
fig.update_layout(hidden_tick_labels)
# fonts, centred title and a horizontal legend below the plot
fig.update_layout(
    font_size=12,
    title_x=0.5,
    title_font_size=14,
    legend=dict(orientation="h", y=-0.2),
)

# save figure, then display it
qp_pca_html = path_save + '/quantum_properties_PCA.html'
fig.write_html(qp_pca_html, config=config)
fig.show(config=config)