In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
from rdkit import Chem
import re

In [7]:
def remove_non_alphanumeric(input_string):
    """Return ``input_string`` with every character outside a-z, A-Z, 0-9 removed.

    Used to normalise drug names so that spelling variants differing only in
    punctuation or whitespace map to the same dictionary key.
    """
    non_alnum_pattern = r'[^a-zA-Z0-9]'
    stripped = re.sub(non_alnum_pattern, '', input_string)
    return stripped

In [13]:
import os

# Locate the project root two directory levels above the notebook's working
# directory and derive the data folders from it.
# os.path.dirname/os.path.join are used instead of splitting on '/' so the
# result is correct for shallow working directories (the split-based version
# produced a leading '//' or raised on an empty os.path.join call) and does
# not depend on the path separator.
current_directory = os.getcwd()
project_root = os.path.dirname(os.path.dirname(current_directory))
# Folder holding raw, non-AnnData inputs and the outputs written by this notebook.
data_path = os.path.join(project_root, 'non_anndata_data')
# Folder holding the .h5ad AnnData files.
anndata_path = os.path.join(project_root, 'anndatas')

# Collect all drugs and determine for which of them a SMILES string is available

## LINCS

In [10]:
# Load the LINCS perturbagen metadata table (tab-separated).
pert_info = pd.read_csv(f'{data_path}/GSE92742_Broad_LINCS_pert_info.txt', sep='\t')

# Row filters, all evaluated on the full table and combined below:
# - keep only the drug-treated (compound) perturbations,
# - drop entries whose SMILES is the '-666' placeholder (63 drugs have no
#   SMILES; they could be looked up manually),
# - drop entries whose SMILES is withheld from publication ('restricted').
is_compound = pert_info['pert_type'] == 'trt_cp'
has_smiles = pert_info['canonical_smiles'] != '-666'
is_public = pert_info['canonical_smiles'] != 'restricted'
df = pert_info[is_compound & has_smiles & is_public].copy()

# Map normalised drug name -> canonical SMILES. If two names normalise to the
# same key, the later row overwrites the earlier one.
D_smiles0 = {}
for drug_name, smiles in zip(df['pert_iname'], df['canonical_smiles']):
    D_smiles0[remove_non_alphanumeric(drug_name)] = smiles
D_smiles0

{'nifurtimox': 'CC1CS(=O)(=O)CCN1N=Cc1ccc(o1)[N+]([O-])=O',
 '5hydroxytryptophan': 'NC(Cc1c[nH]c2cccc(O)c12)C(O)=O',
 'hemado': 'CCCCC#Cc1nc(NC)c2ncn(C3OC(CO)C(O)C3O)c2n1',
 'SA3676': 'CCN1C2C(C(=NC2Nc3ccccc13)OC)c4ccccc4',
 'BRDA00474148': 'Oc1ccc(cc1)N1CCN(CC1)[S+]([O-])(=O)c1ccc2NC(=O)Cc2c1',
 'AFDX116': 'CCN(CC)CC1CCCCN1CC(=O)N1c2ccccc2C(=O)Nc2cccnc12',
 'biperiden': 'OC(CCN1CCCCC1)(C2C[C@H]3C[C@@H]2C=C3)c4ccccc4',
 'BRDA00626522': 'COc1ccc(cc1)C(CC(=O)N2CCCC(C)C2)c3c(O)cc(OC)cc3OC',
 'noretynodrel': 'C[C@]12CC[C@@H]3[C@@H](CCC4=C3CCC(=O)C4)[C@H]1CC[C@@]2(O)C#C',
 'BRDA00763758': 'COc1cc(ccc1O)C(O)C(C)N',
 'dyphylline': 'Cn1c2ncn(CC(O)CO)c2c(=O)n(C)c1=O',
 'alprenolol': 'CC(C)NCC(O)COc1ccccc1CC=C',
 'hexestrol': 'CCC(C(CC)c1ccc(O)cc1)c1ccc(O)cc1',
 'zebularine': 'OC[C@H]1O[C@H]([C@H](O)[C@@H]1O)n1cccnc1=O',
 '7hydroxyPIPAT': 'CCCN(C\\C=C\\I)C1CCc2ccc(O)cc2C1',
 '78dihydroLbiopterin': 'CC(O)[C@H](O)C1=Nc2c(NC1)nc(N)[nH]c2=O',
 'salmeterol': 'OCc1cc(ccc1O)C(O)CNCCCCCCOCCCCc1ccccc1',


## Sciplex

In [22]:
# I downloaded this data from https://drive.google.com/file/d/1_JUg631r_QfZhKl9NZXXzVefgCMPXE_9/view?usp=share_link
adata = sc.read(f'{anndata_path}/trapnell_final_V7.h5ad')

# Derive a normalised 'condition' label from 'product_name':
# 1) keep only the text before the first ' (',
# 2) rename the 'Vehicle' entries to 'control',
# 3) strip every non-alphanumeric character.
base_names = [name.split(' (')[0] for name in adata.obs['product_name']]
adata.obs['condition'] = [
    'control' if name == 'Vehicle' else name
    for name in base_names
]
adata.obs["condition"] = adata.obs["condition"].apply(remove_non_alphanumeric)

# Work on a copy of the per-cell annotations from here on.
df = adata.obs.copy()

In [23]:
# Map normalised condition name -> SMILES for the Sciplex data; with repeated
# condition names the last occurrence wins.
D_smiles1 = dict(zip(map(remove_non_alphanumeric, df['condition']), df['SMILES']))

In [24]:
# Merge both lookups; for names present in both, the Sciplex entry (D_smiles1)
# overrides the LINCS entry (D_smiles0).
D_smiles = {**D_smiles0, **D_smiles1}

# Checking that SMILES are valid according to rdkit

In [25]:
def check_smiles(smiles):
    """Return True if ``smiles`` parses and sanitizes with RDKit, else False.

    Parameters
    ----------
    smiles : str
        Candidate SMILES string. Any non-string value (e.g. ``None`` or a NaN
        coming from a missing table entry) is reported as invalid.

    Returns
    -------
    bool
        False if the value is not a string, if ``Chem.MolFromSmiles`` returns
        ``None``, or if ``Chem.SanitizeMol`` raises; True otherwise.
    """
    # Missing values are not SMILES; reject them up front instead of letting
    # the RDKit call fail on an unexpected argument type.
    if not isinstance(smiles, str):
        return False

    # Parse without sanitization first so that parsing failures and chemistry
    # failures are detected in two separate steps.
    m = Chem.MolFromSmiles(smiles, sanitize=False)
    if m is None:
        # invalid SMILES
        return False

    try:
        Chem.SanitizeMol(m)
    except Exception:
        # invalid chemistry; `Exception` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        return False
    return True

In [26]:
# Print the first SMILES that fails the RDKit check (if any) and stop there;
# no output means every entry passed.
invalid_smiles = (smiles for smiles in D_smiles.values() if not check_smiles(smiles))
for bad_smiles in invalid_smiles:
    print(bad_smiles)
    break

In [27]:
# All SMILES seem fine: the check above printed nothing.

In [28]:
# Persist the name -> SMILES dictionary. np.save stores a dict as a pickled
# object array, so it has to be read back with
# np.load(path, allow_pickle=True).item().
smiles_out_path = f'{data_path}/D_smiles.npy'
np.save(smiles_out_path, D_smiles)