### Signature Similarity (L1000 Signatures 2021)
#### Matrix download link (35.57GB): https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/gctx/cd-coefficient/cp_coeff_mat.gctx
#### Drug metadata download link: https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/gctx/metadata/cp_siginfo_abr.txt
#### Drug aliases download link: https://s3.amazonaws.com/lincs-dcic/sigcom-lincs-metadata/LINCS_small_molecules.tsv

In [None]:
import os

import pandas as pd
import numpy as np
import h5py as h5
import scipy.spatial.distance as dist

import requests
from tqdm import tqdm
import time
import json

In [None]:
def matrix_slice(query_name):
    '''
    Return every signature recorded for one drug as a genes-by-signatures frame.

    Relies on three notebook-level globals: `pert_names` (one label per
    signature), `f` (the open GCTX/HDF5 file) and `genes` (row labels).

    query_name: drug label to look up in `pert_names`.

    Returns a float DataFrame indexed by `genes` with one column per matching
    signature; every column carries `query_name` as its label.
    '''
    # Positions of all signatures whose label equals the requested drug.
    matching = [pos for pos, label in enumerate(pert_names) if label == query_name]

    # Signatures lie along the first axis of the stored matrix.
    stored = f['0']['DATA']['0']['matrix']
    selected = stored[matching, :]

    # Transpose so genes become rows and each signature becomes a column.
    labels = [query_name] * len(matching)
    return pd.DataFrame(selected.T, columns=labels, index=genes, dtype=float)

def similarity_matrix(df, metric, dtype=None, sparse=False):
    '''
    Build a square similarity matrix between the rows of `df`.

    Pairwise distances are computed with scipy's `pdist`, expanded to a square
    matrix, and converted to similarities as `1 - distance`.

    df:     DataFrame whose rows are the items to compare.
    metric: distance metric name (or callable) forwarded to
            `scipy.spatial.distance.pdist`, e.g. 'cosine'.
    dtype:  optional dtype used when converting `df` to a NumPy array.
    sparse: accepted for backward compatibility; currently unused.

    Returns a DataFrame whose rows and columns are both labelled by the index
    of `df` (with the axis names cleared). The diagonal is NaN so that
    self-similarity never shows up as a best match.
    '''
    distances = dist.squareform(dist.pdist(df.to_numpy(dtype=dtype), metric))
    similarities = 1 - distances

    # Set diagonals to NaN on the raw array, before it is wrapped in a
    # DataFrame: writing through `DataFrame.values` is not guaranteed to reach
    # the frame's data and fails on the read-only arrays returned under
    # pandas Copy-on-Write.
    np.fill_diagonal(similarities, np.nan)

    # Work on a copy of the labels so that clearing the name does not also
    # clear the index name of the caller's DataFrame.
    labels = df.index.copy()
    labels.name = None

    return pd.DataFrame(data=similarities, index=labels, columns=labels)

In [None]:
# Open the coefficient matrix read-only. The handle is intentionally left open:
# matrix_slice() reads signatures from `f` on demand.
f = h5.File('input/cp_coeff_mat.gctx', mode='r')

# Row ids are stored as bytes in the file's row metadata; decode them to str.
genes = [gene_id.decode('UTF-8') for gene_id in f['0']['META']['ROW']['id']]

# Signature-level metadata table (supplies the `pert_name` column used below).
drug_meta = pd.read_table(
    'https://lincs-dcic.s3.amazonaws.com/LINCS-sigs-2021/gctx/metadata/cp_siginfo_abr.txt'
)

In [None]:
# Table of BRD-IDs mapped to aliases not included in the original metadata.
# The first column becomes the index (presumably the BRD-ID -- confirm against the file).
drug_alias_table = pd.read_csv(
    'https://s3.amazonaws.com/lincs-dcic/sigcom-lincs-metadata/LINCS_small_molecules.tsv',
    sep='\t',
    index_col=0,
)

# Index -> alias lookup, skipping rows whose alias is the '-' placeholder.
alias_lookup = drug_alias_table.loc[
    drug_alias_table['compound_aliases'] != '-', 'compound_aliases'
].to_dict()

In [None]:
# Swap each signature's perturbagen label for its alias when one is known,
# otherwise keep the label unchanged.
# The metadata order is preserved on purpose: position i in this list must
# line up with position i of the matrix axis indexed in matrix_slice().
pert_names = [
    alias_lookup.get(label, label)
    for label in drug_meta['pert_name'].tolist()
]

### Choosing a consensus signature per drug: the signature with the highest mean absolute Characteristic Direction coefficient

In [None]:
# De-duplicate drug names while keeping first-appearance order.
# A plain set() would yield a different order on every interpreter run (string
# hashing is randomised), which would make the column order of the consensus
# and similarity matrices irreproducible between runs.
unique_drugs = list(dict.fromkeys(pert_names))

In [None]:
# For every drug, keep a single "consensus" signature: the replicate with the
# largest mean absolute coefficient. Drugs with only one signature are kept
# as-is and are not listed in `correlation_data`.
sigs = []
correlation_data = []
for drug in tqdm(unique_drugs):
    drug_matrix = matrix_slice(drug)
    num_sigs = len(drug_matrix.columns)

    if num_sigs <= 1:
        sigs.append(drug_matrix)
        continue

    # The winner is tracked per drug. Previously the chosen signature was
    # carried over between iterations, so a drug whose replicates never scored
    # above 0.0 (all-NaN or all-zero) silently received the previous drug's
    # signature, or raised NameError if it was the first drug processed.
    best_position = None
    correlation_score = 0.0
    for position in range(num_sigs):
        current_score = np.absolute(drug_matrix.iloc[:, position].values).mean()
        # Strict '>' keeps the earliest replicate on ties; NaN scores never win.
        if current_score > correlation_score:
            correlation_score = current_score
            best_position = position

    if best_position is None:
        # No replicate scored above 0.0: fall back to the drug's own first
        # replicate so the column at least holds this drug's data.
        best_position = 0

    consensus_signature = pd.DataFrame(drug_matrix.iloc[:, best_position])
    sigs.append(consensus_signature)
    correlation_data.append([drug, correlation_score, num_sigs])

In [None]:
# Place the per-drug consensus signatures side by side (one column each), then
# discard every column that contains at least one NaN (dropna's default how='any').
consensus_mat = pd.concat(sigs, axis=1).dropna(axis=1)

In [None]:
# Preview the first rows of the consensus matrix (genes as rows, drugs as columns).
consensus_mat.head()

In [None]:
# Per-drug summary: the mean absolute coefficient of the selected signature and
# the number of signatures it was selected from. Drugs with a single signature
# were never added to `correlation_data`, so they do not appear here.
correlation_table = pd.DataFrame(
    correlation_data,
    columns=['pert_name', 'cd_correlation', 'num_sigs'],
)

# Highest-scoring drugs first.
correlation_table = correlation_table.sort_values(by='cd_correlation', ascending=False)

In [None]:
# Preview the top rows; the table is sorted by cd_correlation in descending order.
correlation_table.head()

In [None]:
# Save the per-drug score table as a tab-separated file, without the row index.
correlation_table.to_csv('CD_correlation_table.tsv', sep = '\t', index = False)

Create and save the drug-drug cosine similarity matrix

In [None]:
# similarity_matrix compares rows, so transpose to put drugs on the rows;
# the result is a drug-by-drug cosine similarity matrix.
sim_mat = similarity_matrix(consensus_mat.T, metric='cosine')

In [None]:
# Sanity check: the matrix should be square, one row and one column per drug.
sim_mat.shape

In [None]:
# h5 file
# Write the similarity values and their labels in a single context-managed
# open, so the file is always closed even if a dataset write fails (the
# previous write / close / re-open-in-append sequence leaked the handle on
# error). A separate handle name is used so the input handle `f`, which
# matrix_slice() reads from, is not overwritten.
string_dt = h5.special_dtype(vlen=str)
colids = np.array(sim_mat.columns, dtype=object)

with h5.File('L1000_signature_similarity.h5', 'w') as out_file:
    # Similarity values, stored as 32-bit floats.
    dset = out_file.create_dataset("data/matrix", data=sim_mat, dtype=np.float32)
    # Column labels as variable-length strings; the matrix is labelled by the
    # same ids on both axes, so these also identify the rows.
    out_file.create_dataset("meta/colid", data=colids, dtype=string_dt)