### ECFP4 and ECFP6 Tanimoto Similarity Matrices for LINCS compounds
#### Drug metadata (includes SMILES): https://s3.amazonaws.com/lincs-dcic/sigcom-lincs-metadata/LINCS_small_molecules.tsv

In [None]:
import pandas as pd
import h5py as h5
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

In [None]:
# Load the LINCS small-molecule metadata table; its first column becomes the index.
sigcom_lincs = pd.read_csv(
    'https://s3.amazonaws.com/lincs-dcic/sigcom-lincs-metadata/LINCS_small_molecules.tsv',
    sep='\t',
    index_col=0,
)

# Map each row's index label to its alias, leaving out rows whose alias is
# the '-' placeholder.
# NOTE(review): the keys are index labels, not 'pert_name' values -- confirm
# that is what later lookups expect.
alias_lookup = sigcom_lincs.loc[
    sigcom_lincs['compound_aliases'] != '-', 'compound_aliases'
].to_dict()

In [None]:
# Swap each name for its alias when alias_lookup has an entry for it;
# names without an entry are kept unchanged.
sigcom_lincs['pert_name'] = sigcom_lincs['pert_name'].map(
    lambda name: alias_lookup.get(name, name)
)
# Keep only the first row for every (possibly replaced) name.
sigcom_lincs.drop_duplicates(subset=['pert_name'], keep='first', inplace=True)

In [None]:
# Parse every SMILES string into an RDKit molecule. Entries that are not
# strings (e.g. the NaN pandas stores for an empty cell) would make
# MolFromSmiles raise, so they are mapped to None instead.
molecules = [
    Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    for smiles in sigcom_lincs['canonical_smiles']
]
sigcom_lincs['molecule'] = molecules
# Drop every compound that ended up without a molecule object.
sigcom_lincs.dropna(subset=['molecule'], inplace=True)

In [None]:
# Number of compounds left after the filtering above.
sigcom_lincs.shape[0]

In [None]:
def ECFP(radius=4):
    """Write the all-vs-all Tanimoto similarity matrix of the compounds to HDF5.

    Reads the module-level ``sigcom_lincs`` frame (columns ``pert_name`` and
    ``molecule``), builds one 4096-bit Morgan fingerprint per molecule and
    stores the pairwise Tanimoto similarities in
    ``LINCS_chemicals_ECFP{radius}_similarity_matrix.h5``:

    * ``data/matrix`` -- the similarity matrix as float32, NaN on the diagonal.
    * ``meta/colid``  -- the ``pert_name`` values, in matrix row/column order.

    Args:
        radius: The number in the ECFP name (4 for ECFP4, 6 for ECFP6). In
            ECFP naming that number is the *diameter* of the atom
            environments, whereas RDKit's Morgan fingerprint takes a radius,
            so ``radius // 2`` is what gets passed to RDKit. The parameter
            keeps its historical name so existing calls keep working.

    Raises:
        ValueError: If ``radius`` is negative or odd (no matching Morgan
            radius exists).
    """
    if radius < 0 or radius % 2:
        raise ValueError(
            f'ECFP diameter must be a non-negative even number, got {radius!r}'
        )
    morgan_radius = radius // 2

    names = sigcom_lincs['pert_name'].tolist()
    fingerprints = [
        AllChem.GetMorganFingerprintAsBitVect(mol, morgan_radius, nBits=4096)
        for mol in sigcom_lincs['molecule']
    ]

    # One bulk comparison per fingerprint yields one row of the matrix.
    similarity = np.asarray(
        [DataStructs.BulkTanimotoSimilarity(fp, fingerprints) for fp in fingerprints]
    )
    # Mask the self-comparisons on the diagonal.
    np.fill_diagonal(similarity, np.nan)

    # Write both datasets in a single session; the context manager closes the
    # file even if one of the writes fails.
    with h5.File(f'LINCS_chemicals_ECFP{radius}_similarity_matrix.h5', 'w') as f:
        f.create_dataset("data/matrix", data=similarity, dtype=np.float32)
        f.create_dataset(
            "meta/colid",
            data=np.array(names, dtype=object),
            dtype=h5.special_dtype(vlen=str),
        )

### ECFP4

In [None]:
# Produce LINCS_chemicals_ECFP4_similarity_matrix.h5 (4 is also the default).
ECFP(radius=4)

### ECFP6

In [None]:
# Produce LINCS_chemicals_ECFP6_similarity_matrix.h5.
ECFP(6)