## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet
from cmapPy import pandasGEXpress as pdx

## Load Data

#### Load L1000 Data

In [None]:
# Parse the L1000 GCTX file with cmapPy, then preview the first rows of its data matrix
GCTX_PATH = 'Input/CD_signatures_LM_42809x978.gctx'
gctoo = pdx.parse(GCTX_PATH)
gctoo.data_df.head()

In [None]:
# Mapping table used further down to translate pert_id values into pubchem_cid values
mapping_csv = 'Input/PCID_pertid_mapping.csv'
pcid_pertid = pd.read_csv(mapping_csv)
pcid_pertid.tail()

## Get smallest mean cosine dist for each drug

In [None]:
# Order the row metadata by pert_id and, within each pert_id, by ascending
# mean_cosine_dist_centered_by_batch, so the first row of every group is the
# one with the smallest distance value.
sorted_row_meta = gctoo.row_metadata_df.sort_values(
    by=['pert_id', 'mean_cosine_dist_centered_by_batch']
)
# Keep exactly that first row for each pert_id.
row_meta_df_g = sorted_row_meta.groupby('pert_id').head(1)
row_meta_df_g.tail()

## Build the Signature DataFrame

In [None]:
# Collect the data rows of the selected signatures (one per pert_id) with a
# single label-based lookup. Row order follows row_meta_df_g.index.
# This replaces a row-by-row DataFrame.append loop, which copied the growing
# frame on every iteration and relies on an API removed in pandas 2.0.
# .copy() makes df an independent frame so later cells can add columns to it.
df = gctoo.data_df.loc[row_meta_df_g.index].copy()
df.head()

In [None]:
# Attach the pert_id of each selected signature to df.
# row_meta_df_g holds one metadata row per pert_id (the first row after sorting
# by mean_cosine_dist_centered_by_batch, i.e. the smallest value).
# The lookup is done for all rows at once; .values assigns positionally, in the
# same order as df.index. A row id missing from row_meta_df_g raises KeyError.
df['pert_id'] = row_meta_df_g.loc[df.index, 'pert_id'].values
# pert_id values are stored as the text form of byte strings (e.g. "b'XYZ'").
# Remove only that b'...' / b"..." wrapper. str.strip("b'") treats its argument
# as a set of characters and would also eat a genuine leading or trailing
# "b" or "'" belonging to the identifier itself.
df['pert_id'] = df['pert_id'].str.replace(r"""^b(['"])(.*)\1$""", r"\2", regex=True)

In [None]:
# Re-index the matrix by pert_id (the column is moved into the index)
df = df.set_index('pert_id')

In [None]:
# Change name of drugs to PCID
# Index the mapping table by pert_id. The guard keeps this cell re-runnable:
# a second in-place set_index would fail because the column no longer exists.
if pcid_pertid.index.name != 'pert_id':
    pcid_pertid.set_index('pert_id', inplace=True)
pertid_to_pcid = pcid_pertid['pubchem_cid']
# If a pert_id is listed more than once, keep its first entry so that every
# lookup yields a single value.
pertid_to_pcid = pertid_to_pcid[~pertid_to_pcid.index.duplicated(keep='first')]
# Look up all pert_ids at once. A pert_id that is absent from the mapping
# gets NaN here (instead of raising KeyError) and is removed by dropna below.
pcid_values = pertid_to_pcid.reindex(df.index)
# NOTE(review): the previous row-by-row fill into a new column appears to
# produce a float column for numeric IDs; the cast keeps the resulting labels
# (and thus the saved header) unchanged. Confirm whether integer labels are
# preferred.
if pd.api.types.is_numeric_dtype(pcid_values):
    pcid_values = pcid_values.astype('float64')
df['PCID'] = pcid_values.values
df = df.dropna(subset=["PCID"])
df.set_index('PCID', inplace=True)

In [None]:
# Get rid of the b'...' wrapper around each gene signature name.
# Only the wrapper itself (b'...' or b"...") is removed. str.strip("b'") treats
# its argument as a set of characters, so it would also remove a genuine
# leading or trailing "b" or "'" from a name.
columns = list(
    pd.Series(df.columns).str.replace(r"""^b(['"])(.*)\1$""", r"\2", regex=True)
)
df.columns = columns
# Transpose so that signature names become the rows (index named 'Sig')
df = df.T
df.index.name = 'Sig'

In [None]:
# Preview the first rows of the final matrix
df.head()

## Save Matrix

In [None]:
# The output name carries the current year and month as YYYY_MM.
# NOTE(review): the name ends in ".zip" but the file is written with gzip
# compression. Readers that infer the compression from the extension will not
# be able to open it; confirm which of the two is intended before changing either.
year_month = datetime.date.today().strftime('%Y_%m')
filename = 'Output/L1000_signatures_pertid_%s.tsv.zip' % year_month
# Tab-separated, gzip-compressed dump of the matrix
df.to_csv(filename, sep='\t', compression='gzip')