### This script creates the .h5 files containing the co-expression and co-occurrence correlation matrices for DrugShot

In [None]:
import os
import requests
import time

import pandas as pd
import numpy as np
import scipy.spatial.distance as dist
import h5py as h5

### Co-Expression

In [None]:
# Fetch the L1000 compound-induced gene expression signatures (gzipped CSV)
# and key the rows by their InChI Key.
download_link = (
    'https://appyters.maayanlab.cloud/storage/Drugmonizome_ML/SEP-L1000/'
    'LINCS_Gene_Experssion_signatures_CD.csv.gz'
)
df = pd.read_csv(download_link)
df = df.set_index('InChI Key')

In [None]:
# Base URL of the L1000FWD web application.
# NOTE(review): not referenced by the code visible in this file - confirm it is still needed.
L1000FWD_URL = 'http://amp.pharm.mssm.edu/L1000FWD/'
# Local path where the L1000FWD drug metadata CSV is cached after download.
L1000FWD_METADATA = 'L1000FWD/Drugs_metadata.csv'

def _download_metadata():
    """ Checks whether metadata file has been downloaded, and if not
        downloads it from the L1000FWD downloads page.

        The file is first written next to its final location with a
        '.part' suffix and only renamed to ``L1000FWD_METADATA`` once the
        download has completed, so an interrupted download is never
        mistaken for a valid cached copy on the next call.

        Raises:
            Exception: if the server does not answer with HTTP 200.
    """
    if os.path.isfile(L1000FWD_METADATA):
        return

    # Create the cache directory (no error if it already exists)
    target_dir = os.path.dirname(L1000FWD_METADATA)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    url = 'https://maayanlab.cloud/L1000FWD/download/Drugs_metadata.csv'
    partial_path = L1000FWD_METADATA + '.part'

    # The context manager releases the streamed connection even on failure
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            raise Exception(
                'Failed to download {}: HTTP status {}'.format(url, response.status_code))
        with open(partial_path, 'wb') as outfile:
            for chunk in response.iter_content(chunk_size=1024):
                outfile.write(chunk)

    # Publish the complete file under its final name
    os.replace(partial_path, L1000FWD_METADATA)

def get_drug_names(keys):
    """ Translates drug InChI keys into drug names using the L1000FWD metadata.

        Returns a list aligned with `keys` holding the `pert_iname` entry found
        for each key; keys that are absent from the metadata yield missing values.
    """
    _download_metadata()

    def _strip_prefix(key):
        # Non-string index entries are passed through untouched
        if isinstance(key, str):
            return key.replace('InChIKey=', '')
        return key

    metadata = pd.read_csv(L1000FWD_METADATA, index_col=5)
    metadata.index = metadata.index.map(_strip_prefix)

    # Keep only the first row for every repeated key
    first_occurrence = ~metadata.index.duplicated()
    metadata = metadata.iloc[first_occurrence]

    names = metadata['pert_iname'].reindex(keys)
    return list(names)

In [None]:
# Relabel the rows with drug names; get_drug_names() yields a missing value
# for every InChI Key it cannot resolve.
df.index = get_drug_names(df.index)

# Moving the labels into a regular 'index' column lets dropna() discard rows
# whose label is missing together with rows holding missing values.
df = df.reset_index().dropna().set_index('index')

# When several rows share a drug name, only the first one is kept.
repeated_name = df.index.duplicated(keep='first')
df = df.loc[np.logical_not(repeated_name)]

In [None]:
def similarity_matrix(df, metric, dtype=None, sparse=False):
    '''
    Creates a similarity matrix between the rows of the df based on
    the metric specified. The resulting matrix has both rows and columns labeled
    by the index of df.

    Parameters:
        df: DataFrame whose rows are compared pairwise.
        metric: distance metric forwarded to scipy.spatial.distance.pdist;
            the similarity is computed as 1 - distance.
        dtype: optional dtype used when converting df to a NumPy array.
        sparse: accepted for backward compatibility; currently unused.

    Returns:
        Square DataFrame of pairwise similarities with NaN on the diagonal
        and unnamed row and column indexes. The input df is left unmodified.
    '''
    distances = dist.squareform(dist.pdist(df.to_numpy(dtype=dtype), metric))
    similarities = 1 - distances

    # Set diagonals to NaN so self-similarity is excluded. This is done on the
    # raw array before it is wrapped, because writing through DataFrame.values
    # is not guaranteed to reach the frame (and the array is read-only when
    # pandas Copy-on-Write is active).
    np.fill_diagonal(similarities, float("NaN"))

    # Label with an unnamed copy of the index so that the caller's df.index
    # keeps its own name.
    labels = df.index.copy()
    labels.name = None

    return pd.DataFrame(data=similarities, index=labels, columns=labels)

In [None]:
# Pairwise cosine similarity between the drug signatures (rows of df)
coexpression = similarity_matrix(df, metric='cosine')

In [None]:
# h5 file
string_dt = h5.special_dtype(vlen=str)
colids = np.array(coexpression.columns, dtype=object)

# Write the matrix and its column labels in a single pass. The context manager
# closes the file even if one of the dataset writes fails.
with h5.File('L1000_coexpression.h5', 'w') as f:
    f.create_dataset(
        "data/matrix",
        data=coexpression.to_numpy(dtype=np.float32),
        dtype=np.float32,
    )
    f.create_dataset("meta/colid", data=colids, dtype=string_dt)

In [None]:
# npz format
# Only the matrix values and the row labels are stored in the archive;
# `columns` is computed but not written.
index = np.array(coexpression.index)
columns = np.array(coexpression.columns)
data = coexpression.to_numpy(dtype=np.float32)
np.savez_compressed(
    'L1000_similarity_matrix.npz',
    correlations=data,
    index=index,
)

### Co-Occurrence

The co-occurrence matrix file (drugrif_cooccur.tsv) is generated using Alex's cooccur.jar script

In [None]:
# Save decompressed drugrif file locally to be used in cooccur.jar
import gzip
import shutil
import urllib.request

path = "https://appyters.maayanlab.cloud/storage/DrugShot/DrugRIF.tsv.gz"

# Stream-decompress straight to disk instead of holding both the compressed
# and decompressed payloads in memory; the context managers close the HTTP
# response and the output file even if the transfer fails.
with urllib.request.urlopen(path) as response:
    with gzip.GzipFile(fileobj=response) as compressed, open('drugrif.tsv', 'wb') as outfile:
        shutil.copyfileobj(compressed, outfile)

In [None]:
import shlex
import subprocess

# Build the co-occurrence matrix from drugrif.tsv with cooccur.jar.
command = "java -jar cooccur.jar -f drugrif.tsv -e 0 -p 2 -t 12 -o drugrif_cooccur.tsv"
# check=True raises CalledProcessError when java exits with a non-zero status,
# so a failed run is not silently followed by reading a stale or missing
# drugrif_cooccur.tsv. The command is passed as an argument list (no shell).
subprocess.run(shlex.split(command), check=True)

In [None]:
# Load the tab-separated matrix written by cooccur.jar
cooccur = pd.read_csv("drugrif_cooccur.tsv", sep="\t")
# Use the values of the first column as the row labels
# (presumably the drug names - confirm against the cooccur.jar output format)
cooccur.index = cooccur.iloc[:,0]
# Keep every column except the first one, which now serves as the index
cooccur_clean = cooccur.iloc[:,1:]
# Drop the index name that was carried over from the first column's header
cooccur_clean.index.name = None

In [None]:
DrugRIF = pd.read_csv('drugrif.tsv', sep='\t')
# Preserve cases of each name in DrugRIF: map the upper-cased name back to its
# original spelling (presumably the labels in the co-occurrence matrix are
# upper-cased - confirm against the cooccur.jar output). Missing names are
# skipped because they cannot be upper-cased.
lookup_dict = {name.upper(): name for name in DrugRIF['name'].dropna()}

cooccur_matrix = cooccur_clean.rename(columns=lookup_dict, index=lookup_dict)

# Zero the diagonal on an explicit array copy and rebuild the frame. Writing
# through DataFrame.values is a silent no-op whenever pandas hands back a copy,
# and fails on the read-only array returned under Copy-on-Write.
cooccur_values = cooccur_matrix.to_numpy(copy=True)
np.fill_diagonal(cooccur_values, 0)
cooccur_matrix = pd.DataFrame(
    cooccur_values,
    index=cooccur_matrix.index,
    columns=cooccur_matrix.columns,
)

In [None]:
string_dt = h5.special_dtype(vlen=str)
colids = np.array(cooccur_matrix.columns, dtype=object)

# Write the matrix and its column labels in a single pass. The context manager
# closes the file even if one of the dataset writes fails.
with h5.File('drugrif_cooccur.h5', 'w') as f:
    f.create_dataset(
        "data/matrix",
        data=cooccur_matrix.to_numpy(dtype=np.float32),
        dtype=np.float32,
    )
    f.create_dataset("meta/colid", data=colids, dtype=string_dt)