## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet

## Load Data

In [None]:
# Read the drug table from disk and preview its first rows.
# Later cells use its 'pert_id' and 'scaffolds' columns.
drug_table_path = 'Input/drugs_smiles_rings_scaffolds.csv'
df = pd.read_csv(drug_table_path)
df.head(n=5)

In [None]:
# Number of distinct pert_id values (a missing value, if present, counts as one).
df['pert_id'].nunique(dropna=False)

## Split up Scaffolds Column

In [None]:
# Split rows whose 'scaffolds' cell lists several scaffolds separated by ';'
# into one record per scaffold.
#
# Produces:
#   appended_df -- list of row-value lists, one per (row, single scaffold) pair
#   how_many    -- number of original rows that were split
# and removes the split rows from df (they are re-added from appended_df later).

# Position of the 'scaffolds' column inside a row converted to a plain list.
scaf_index = np.where(df.columns.values == 'scaffolds')[0][0]

appended_df = []
split_labels = []  # index labels of the rows that held several scaffolds

for index, row in df.iterrows():
    scaf_group = row.loc['scaffolds']
    # str() lets missing values (NaN) pass through the check without raising.
    if ';' not in str(scaf_group):
        continue
    for scaffold in scaf_group.split(';'):
        row_as_list = row.values.tolist()
        row_as_list[scaf_index] = scaffold
        appended_df.append(row_as_list)
    split_labels.append(index)

# Drop the multi-scaffold rows in one pass AFTER iterating: dropping inside the
# loop mutates the frame being iterated and costs one full drop per split row.
if split_labels:
    df.drop(split_labels, inplace=True)
how_many = len(split_labels)

print(len(appended_df))
print(how_many)

In [None]:
# Turn the split-out rows back into a DataFrame with the same column layout as df.
columnnames = df.columns.tolist()
fix_df = pd.DataFrame(data=appended_df, columns=columnnames)

In [None]:
# Preview the first rows of the one-scaffold-per-row records.
fix_df.head(n=5)

In [None]:
# Add the one-scaffold-per-row records back to the main table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with append's defaults, existing index labels are
# kept as they are (they are not renumbered).
df = pd.concat([df, fix_df])

In [None]:
# Discard every row that has a missing value in any column, then show the
# resulting (rows, columns) dimensions.
df = df.dropna(axis=0, how='any')
df.shape

## Make Binary Matrix

In [None]:
# For each scaffold, join the pert_ids of its rows into one comma-separated
# string; the result is a single-column frame indexed (and sorted) by scaffold.
pert_ids_by_scaffold = df.groupby(['scaffolds'])['pert_id'].apply(
    lambda ids: ','.join(ids.astype(str))
)
grouped_df = pert_ids_by_scaffold.to_frame().sort_index()

# Number of distinct scaffolds.
len(df['scaffolds'].unique())

In [None]:
# Expand each comma-separated ID string into 0/1 indicator columns:
# one row per scaffold, one column per distinct ID.
joined_ids = grouped_df.iloc[:, 0]
grouped_matrix = joined_ids.str.get_dummies(sep=',')
grouped_matrix.head(n=5)

In [None]:
# Show the (rows, columns) dimensions of the indicator matrix.
grouped_matrix.shape

## Save Binary Matrix

In [None]:
# Name the output after the current year and month (YYYY_MM).
filename = 'Output/L1000_Scaffolds_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')

# Write a tab-separated table inside a real zip archive. The path ends in
# '.zip', so the compression must be 'zip'; writing gzip data under a '.zip'
# name yields a file that zip tools and extension-based inference cannot open.
grouped_matrix.to_csv(filename, sep='\t', compression='zip')

## Change Pert ID to PubChemID

In [None]:
# Read the mapping table from disk and preview its first rows.
# Later cells use its 'pert_id' and 'pubchem_cid' columns.
mapping_path = 'Input/PCID_pertid_mapping.csv'
pcid_pert_df = pd.read_csv(mapping_path)
pcid_pert_df.head(n=5)

In [None]:
# Preview the first rows of the scaffold table.
df.head(n=5)

In [None]:
# Re-index both tables by 'pert_id' (in place) so rows can be looked up by that ID.
for frame in (pcid_pert_df, df):
    frame.set_index('pert_id', inplace=True)

In [None]:
# Fill df['pcid'] with the 'pubchem_cid' of each row's pert_id (both frames are
# indexed by pert_id at this point). Rows whose pert_id is absent from the
# mapping are left untouched, so they stay NaN in a newly created 'pcid' column.
#
# This is a single vectorized assignment instead of one .loc lookup and
# assignment per row label, which is very slow on a large, non-unique index.

# pert_id -> CID lookup. If the mapping lists a pert_id more than once, keep
# the first entry so the lookup stays unambiguous.
cid_by_pert = pcid_pert_df['pubchem_cid']
cid_by_pert = cid_by_pert[~cid_by_pert.index.duplicated(keep='first')]

# Boolean mask (positional, so duplicate pert_ids in df are handled) of the
# rows that have an entry in the mapping.
has_cid = df.index.isin(cid_by_pert.index)

# With no matches nothing is assigned and no 'pcid' column is created,
# matching what the per-row loop did.
if has_cid.any():
    df.loc[has_cid, 'pcid'] = cid_by_pert.reindex(df.index[has_cid]).values

In [None]:
# Discard every row that has a missing value in any column; this includes rows
# for which no 'pcid' was filled in.
df = df.dropna(axis=0, how='any')

In [None]:
# Show the (rows, columns) dimensions of df after rows with missing values were dropped.
df.shape

## Make Binary Matrix for PCID version

In [None]:
# For each scaffold, join the 'pcid' values of its rows into one comma-separated
# string; the result is a single-column frame indexed (and sorted) by scaffold.
pcids_by_scaffold = df.groupby(['scaffolds'])['pcid'].apply(
    lambda cids: ','.join(cids.astype(str))
)
grouped_df = pcids_by_scaffold.to_frame().sort_index()
grouped_df.head(n=5)

In [None]:
# Expand each comma-separated ID string into 0/1 indicator columns (one row per
# scaffold), then transpose so each row is an ID and each column a scaffold.
grouped_matrix = grouped_df.iloc[:, 0].str.get_dummies(sep=',').transpose()
grouped_matrix.head(n=5)

In [None]:
# Show the (rows, columns) dimensions of the transposed indicator matrix.
grouped_matrix.shape

In [None]:
# Name the output after the current year and month (YYYY_MM).
filename = 'Output/L1000_Scaffolds_pcid_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')

# Write a tab-separated table inside a real zip archive. The path ends in
# '.zip', so the compression must be 'zip'; writing gzip data under a '.zip'
# name yields a file that zip tools and extension-based inference cannot open.
grouped_matrix.to_csv(filename, sep='\t', compression='zip')