In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
from rdkit import Chem
import re
import numpy as np
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import torch

In [2]:
import os
current_directory = os.getcwd()
data_path='/'+os.path.join(*current_directory.split('/')[:-2])+'/non_anndata_data'

In [3]:
D_smiles=np.load(f'{data_path}/D_smiles.npy', allow_pickle=True).item()

In [4]:
smiles_list=list(D_smiles.values())
# both these dmso's map to the same embedding vector
smiles_list.append('CS(C)=O')
smiles_list.append('CS(=O)C')
smiles_list=sorted(set(smiles_list))
if '' in smiles_list:
    smiles_list.remove('')
len(smiles_list)

19920

In [5]:
from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator
generator = MakeGenerator(("RDKit2D",))
for name, numpy_type in generator.GetColumns():
    print(f"{name}({numpy_type.__name__})")

RDKit2D_calculated(bool)
BalabanJ(float64)
BertzCT(float64)
Chi0(float64)
Chi0n(float64)
Chi0v(float64)
Chi1(float64)
Chi1n(float64)
Chi1v(float64)
Chi2n(float64)
Chi2v(float64)
Chi3n(float64)
Chi3v(float64)
Chi4n(float64)
Chi4v(float64)
EState_VSA1(float64)
EState_VSA10(float64)
EState_VSA11(float64)
EState_VSA2(float64)
EState_VSA3(float64)
EState_VSA4(float64)
EState_VSA5(float64)
EState_VSA6(float64)
EState_VSA7(float64)
EState_VSA8(float64)
EState_VSA9(float64)
ExactMolWt(float64)
FpDensityMorgan1(float64)
FpDensityMorgan2(float64)
FpDensityMorgan3(float64)
FractionCSP3(float64)
HallKierAlpha(float64)
HeavyAtomCount(float64)
HeavyAtomMolWt(float64)
Ipc(float64)
Kappa1(float64)
Kappa2(float64)
Kappa3(float64)
LabuteASA(float64)
MaxAbsEStateIndex(float64)
MaxAbsPartialCharge(float64)
MaxEStateIndex(float64)
MaxPartialCharge(float64)
MinAbsEStateIndex(float64)
MinAbsPartialCharge(float64)
MinEStateIndex(float64)
MinPartialCharge(float64)
MolLogP(float64)
MolMR(float64)
MolWt(float64)

In [6]:
n_jobs = 10
data = Parallel(n_jobs=n_jobs)(delayed(generator.process)(smiles) for smiles in tqdm(smiles_list, position=0, leave=True) )

  0%|          | 0/19920 [00:00<?, ?it/s]

In [7]:
embedding = np.array(data)
embedding

array([[1.00000000e+00, 2.76025208e+00, 1.03587975e+02, ...,
        0.00000000e+00, 0.00000000e+00, 3.82968157e-01],
       [1.00000000e+00, 2.01735645e+00, 4.92479347e+02, ...,
        0.00000000e+00, 0.00000000e+00, 8.29231611e-01],
       [1.00000000e+00, 2.01663202e+00, 5.24366066e+02, ...,
        0.00000000e+00, 0.00000000e+00, 8.62580013e-01],
       ...,
       [1.00000000e+00, 3.12500000e+00, 6.01248181e+01, ...,
        0.00000000e+00, 0.00000000e+00, 4.46031488e-01],
       [1.00000000e+00, 1.73988952e+00, 8.79889662e+02, ...,
        0.00000000e+00, 0.00000000e+00, 5.07135544e-01],
       [1.00000000e+00, 2.24409478e+00, 5.20528596e+02, ...,
        0.00000000e+00, 0.00000000e+00, 6.57282831e-01]])

In [8]:
# See if it contains nans

In [9]:
drug_idx, feature_idx = np.where(np.isnan(embedding))
print(f'drug_idx:\n {drug_idx}')
print(f'feature_idx:\n {feature_idx}')

drug_idx:
 [ 2568  2568  2568  2568  3643  3643  3643  3643  3862  3862  3862  3862
 11020 11020 11020 11020 15710 15710 15710 15710 15711 15711 15711 15711
 15712 15712 15712 15712 15713 15713 15713 15713 15714 15714 15714 15714
 16326 16326 16326 16326 16659 16659 16659 16659 16904 16904 16904 16904
 19646 19646 19646 19646 19771 19771 19771 19771 19857 19857 19857 19857]
feature_idx:
 [40 42 44 46 40 42 44 46 40 42 44 46 40 42 44 46 40 42 44 46 40 42 44 46
 40 42 44 46 40 42 44 46 40 42 44 46 40 42 44 46 40 42 44 46 40 42 44 46
 40 42 44 46 40 42 44 46 40 42 44 46]


In [10]:
drug_idx_infs, feature_idx_infs = np.where(np.isinf(embedding))

drug_idx = np.concatenate((drug_idx, drug_idx_infs))
feature_idx = np.concatenate((feature_idx, feature_idx_infs))
np.array(generator.GetColumns())[np.unique(feature_idx)]

array([['MaxAbsPartialCharge', <class 'numpy.float64'>],
       ['MaxPartialCharge', <class 'numpy.float64'>],
       ['MinAbsPartialCharge', <class 'numpy.float64'>],
       ['MinPartialCharge', <class 'numpy.float64'>]], dtype=object)

In [11]:
embedding[drug_idx, feature_idx] 

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, inf, inf, inf, inf, inf,
       inf])

In [12]:
np.array(generator.GetColumns())[np.unique(feature_idx)]

array([['MaxAbsPartialCharge', <class 'numpy.float64'>],
       ['MaxPartialCharge', <class 'numpy.float64'>],
       ['MinAbsPartialCharge', <class 'numpy.float64'>],
       ['MinPartialCharge', <class 'numpy.float64'>]], dtype=object)

In [13]:
# if so, set them to 0

In [14]:
embedding[np.isnan(embedding)]=0

In [15]:
df = pd.DataFrame(data=embedding,index=smiles_list,columns=[f'latent_{i}' for i in range(embedding.shape[1])]) 
# First column is True, for if SMILES could be identified
df.drop(columns=['latent_0'], inplace=True)
df

Unnamed: 0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,latent_10,...,latent_191,latent_192,latent_193,latent_194,latent_195,latent_196,latent_197,latent_198,latent_199,latent_200
BrC1C(Br)C(Br)C(Br)C(Br)C1Br,2.760252,103.587975,9.464102,5.731888,15.247868,5.464102,3.309307,8.803361,2.666558,9.010545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382968
BrCC(=O)NCCc1c[nH]c2ccccc12,2.017356,492.479347,11.380469,8.794284,10.380281,7.808862,5.186746,6.308215,3.593081,4.153815,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.829232
BrCC(=O)NCCc1ccc2ccccc2c1,2.016632,524.366066,12.087576,9.448985,11.034982,8.292025,5.597430,6.718899,3.915087,4.475821,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.862580
BrCC(=O)NCCc1ccccc1,2.241377,260.887073,9.518662,7.294284,8.880281,6.325699,4.192729,5.314198,2.719182,3.279917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.800349
Brc1c(Br)c(Br)c2[nH]nnc2c1Br,3.019418,436.174367,9.585422,5.906285,12.250271,6.125898,3.153143,6.325136,2.291339,5.463332,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.502785
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
c1ccc2c(c1)c3cccc4ccc5cccc2c5c43,2.329151,998.004970,13.104084,10.928203,10.928203,9.932653,6.976068,6.976068,5.418518,5.418518,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.243454
c1ccc2c(c1)cc3ccc4cccc5ccc2c3c45,2.230840,1069.721887,13.104084,10.928203,10.928203,9.915816,6.970085,6.970085,5.453739,5.453739,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.243454
c1ccoc1,3.125000,60.124818,3.535534,2.717649,2.717649,2.500000,1.471405,1.471405,0.793148,0.793148,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.446031
c1cn(cn1)C(c1ccccc1)c1ccc(cc1)-c1ccccc1,1.739890,879.889662,16.192024,13.286732,13.286732,11.915816,8.094412,8.094412,5.811151,5.811151,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.507136


In [16]:
normalized_df=(df-df.mean())/df.std()
normalized_df.head()

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,latent_10,...,latent_191,latent_192,latent_193,latent_194,latent_195,latent_196,latent_197,latent_198,latent_199,latent_200
BrC1C(Br)C(Br)C(Br)C(Br)C1Br,2.444326,-2.573703,-2.077891,-2.383055,-0.713381,-2.336661,-2.392564,-0.832262,-2.21805,-0.074075,...,-0.497893,-0.070788,-0.037517,-0.063896,-0.153015,,-0.149764,-0.136016,-0.474671,-1.109687
BrCC(=O)NCCc1c[nH]c2ccccc12,0.733545,-1.500738,-1.787367,-1.82102,-1.608374,-1.794426,-1.811097,-1.586766,-1.871179,-1.783789,...,-0.497893,-0.070788,-0.037517,-0.063896,-0.153015,,-0.149764,-0.136016,-0.474671,1.394637
BrCC(=O)NCCc1ccc2ccccc2c1,0.731877,-1.412761,-1.680169,-1.700864,-1.487996,-1.682692,-1.683903,-1.46258,-1.750627,-1.670433,...,-0.497893,-0.070788,-0.037517,-0.063896,-0.153015,,-0.149764,-0.136016,-0.474671,1.58178
BrCC(=O)NCCc1ccccc1,1.249431,-2.139709,-2.06962,-2.096312,-1.884176,-2.137413,-2.118957,-1.887346,-2.198349,-2.091427,...,-0.497893,-0.070788,-0.037517,-0.063896,-0.153015,,-0.149764,-0.136016,-0.474671,1.232556
Brc1c(Br)c(Br)c2[nH]nnc2c1Br,3.041148,-1.656085,-2.059499,-2.351048,-1.264543,-2.183618,-2.44093,-1.58165,-2.358525,-1.3228,...,-0.497893,-0.070788,-0.037517,-0.063896,-0.153015,,-0.149764,-0.136016,-0.474671,-0.437305


In [17]:
# drop columns that are all nans
drop_cols=np.isnan(normalized_df).sum(0)[np.isnan(normalized_df).sum(0)>0].index
normalized_df=normalized_df[[a for a in normalized_df.columns if not a in drop_cols]].copy()

In [18]:
D_smiles_inverted={a:b for a,b in zip(D_smiles.values(), D_smiles.keys())}
D_smiles_inverted['CS(C)=O']='control'
D_smiles_inverted['CS(=O)C']='control'

In [19]:
def DataFrame_to_dict_of_tensors(normalized_df, D_smiles_inverted):
    smiless=list(normalized_df.index)
    SMILES_embedding={}
    for i in tqdm(range(len(normalized_df))):
        smiles=smiless[i]
        SMILES_embedding[D_smiles_inverted[smiles]]=torch.tensor(normalized_df.loc[smiles])
    return(SMILES_embedding)

In [20]:
SMILES_embedding=DataFrame_to_dict_of_tensors(normalized_df, D_smiles_inverted)

  0%|          | 0/19920 [00:00<?, ?it/s]

  SMILES_embedding[D_smiles_inverted[smiles]]=torch.tensor(normalized_df.loc[smiles])


In [21]:
import re
def remove_non_alphanumeric(input_string):
    return re.sub(r'[^a-zA-Z0-9]', '', input_string)
genes=list(SMILES_embedding.keys())
for gene in genes:
    SMILES_embedding[remove_non_alphanumeric(gene)]=SMILES_embedding[gene]

In [22]:
torch.save(SMILES_embedding, f'{data_path}/rdkit2D_embedding.pt')

In [23]:
# And for baseline set them all to zero
for k in SMILES_embedding.keys():
    SMILES_embedding[k]=abs(SMILES_embedding[k])*0

In [24]:
torch.save(SMILES_embedding, f'{data_path}/rdkit2D_embedding_all_zero.pt')