## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import time
from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
from scipy import stats, integrate

In [None]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import PandasTools
from rdkit import RDConfig
from rdkit.Chem import AllChem
from rdkit import Avalon
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem.Fingerprints import ClusterMols
from rdkit.Chem.Fingerprints import FingerprintMols

## Load FPS Files

In [None]:
# Load every fingerprint table in 'RDKit_fps/' into fpsfiles_dict, keyed by the
# file name with its trailing 24 characters stripped.
fps = []
files = []

for filename in os.listdir('RDKit_fps/'):
    fp_name = filename[:-24]
    # File names of 24 characters or fewer strip down to an empty key
    # (presumably hidden/system files). Skip them before parsing instead of
    # deleting the '' entry afterwards, which raised when no such file existed
    # and left duplicates behind when there was more than one.
    if not fp_name:
        continue
    df = pd.read_table('RDKit_fps/' + filename)
    # Cast the identifier column to str before it becomes the index.
    df[df.columns[0]] = df[df.columns[0]].astype(str)
    fps.append(fp_name)
    files.append(df.set_index(df.columns[0]))

fpsfiles_dict = dict(zip(fps, files))

In [None]:
# Order the fingerprint names alphabetically, then preview the first two.
fps.sort()
fps[0:2]

In [None]:
# Load every table in 'All_bmat/' into namefiles_dict, keyed by the file name
# with its trailing 12 characters stripped.
names = []
files = []

for filename in os.listdir('All_bmat/'):
    resource_name = filename[:-12]
    # File names of 12 characters or fewer strip down to an empty key
    # (presumably hidden/system files). Skip them before parsing instead of
    # deleting the '' entry afterwards, which raised when no such file existed
    # and left duplicates behind when there was more than one.
    if not resource_name:
        continue
    df = pd.read_table('All_bmat/' + filename)
    names.append(resource_name)
    files.append(df.set_index(df.columns[0]))

namefiles_dict = dict(zip(names, files))

In [None]:
# Display the resource names collected from the 'All_bmat/' file names.
names

## Build Similarity Matrix

In [None]:
def pairwise_dis(df, metric):
    """Return the pairwise distance matrix computed over the rows of ``df``.

    ``df`` and ``metric`` are passed straight through to
    ``sklearn.metrics.pairwise_distances``; its result is returned unchanged.
    """
    return metrics.pairwise_distances(df, metric=metric)

def vector(array_matrix):
    """Run ``array_matrix`` through ``scipy.spatial.distance.squareform``.

    A square distance matrix comes back as its condensed 1-D vector (and a
    condensed vector comes back as the square matrix).
    """
    converted = squareform(array_matrix)
    return converted

In [None]:
# Square matrix over all resources and fingerprints that will hold the
# pairwise Pearson coefficients; a cell equal to 0 means "not computed yet".
# Fill with 0.0 rather than the integer 0 so every column starts as float64:
# the coefficients written later are floats, and assigning floats into int64
# columns relies on silent upcasting that newer pandas deprecates.
fps_pheno_df = pd.DataFrame(0.0, index=names + fps, columns=names + fps)
fps_pheno_df.head(2)

In [None]:
# Fill the resource-vs-resource quadrant of fps_pheno_df with the Pearson
# correlation between the two resources' pairwise Manhattan distance vectors.
for dbname1 in names:
    for dbname2 in names:
        # A non-zero cell has already been filled (by the mirrored write
        # below, or by an earlier run of this cell), so skip it.
        if fps_pheno_df.loc[dbname1, dbname2] != 0:
            continue
        db1, db2 = namefiles_dict[dbname1].T, namefiles_dict[dbname2].T
        # Restrict both tables to the row labels they have in common, in one
        # fixed order, so the two condensed distance vectors line up.
        shared = sorted(set(db1.index) & set(db2.index))
        db1 = db1.loc[shared]
        db2 = db2.loc[shared]
        db1_vect = vector(pairwise_dis(db1, 'manhattan'))
        db2_vect = vector(pairwise_dis(db2, 'manhattan'))
        p_coeff = pearsonr(db1_vect, db2_vect)[0]
        fps_pheno_df.loc[dbname1, dbname2] = p_coeff
        # Mirror the value, as the other two fill loops do; without this the
        # skip above never triggers and every pair is computed twice.
        fps_pheno_df.loc[dbname2, dbname1] = p_coeff

In [None]:
# Fingerprint-vs-fingerprint quadrant: correlate the pairwise Manhattan
# distance vectors of every pair of fingerprint tables.
for fps1 in fps:
    for fps2 in fps:
        # Only cells still holding 0 need work; the mirrored write below
        # fills the symmetric cell ahead of time.
        if fps_pheno_df.loc[fps1, fps2] == 0:
            fpdb1 = fpsfiles_dict[fps1]
            fpdb2 = fpsfiles_dict[fps2]
            # Keep only the row labels both tables contain, in sorted order.
            shared = sorted(set(fpdb1.index).intersection(fpdb2.index))
            fpdb1, fpdb2 = fpdb1.loc[shared], fpdb2.loc[shared]
            fpdb1_vect = vector(pairwise_dis(fpdb1, 'manhattan'))
            fpdb2_vect = vector(pairwise_dis(fpdb2, 'manhattan'))
            p_coeff = pearsonr(fpdb1_vect, fpdb2_vect)[0]
            fps_pheno_df.loc[fps1, fps2] = p_coeff
            fps_pheno_df.loc[fps2, fps1] = p_coeff

In [None]:
# Cross quadrant: correlate each resource's pairwise Manhattan distances with
# those of each fingerprint table, over the row labels they share.
for dbname in names:
    for fp in fps:
        if fps_pheno_df.loc[dbname, fp] != 0:
            # Already filled; nothing to do for this pair.
            continue
        db = namefiles_dict[dbname].T
        fpdb = fpsfiles_dict[fp]
        # Sorted intersection of the two indexes keeps both tables aligned.
        shared = sorted(set(db.index).intersection(fpdb.index))
        db, fpdb = db.loc[shared], fpdb.loc[shared]
        db_vect = vector(pairwise_dis(db, 'manhattan'))
        fpdb_vect = vector(pairwise_dis(fpdb, 'manhattan'))
        p_coeff = pearsonr(db_vect, fpdb_vect)[0]
        # Store the coefficient in both symmetric cells.
        fps_pheno_df.loc[dbname, fp] = p_coeff
        fps_pheno_df.loc[fp, dbname] = p_coeff

## Plots for full df

In [None]:
# Draw the full coefficient matrix as a colour-coded grid with a colour bar.
fig, ax = plt.subplots()
cax = ax.matshow(fps_pheno_df)
fig.colorbar(cax)

# One tick per row/column, labelled with the resource and fingerprint names.
axis_labels = names + fps
tick_positions = range(len(axis_labels))
plt.xticks(tick_positions, axis_labels, rotation=90)
plt.yticks(tick_positions, axis_labels)
plt.show()

In [None]:
# Seaborn heatmap of the full coefficient matrix.
sns.heatmap(data=fps_pheno_df)

In [None]:
# Clustered heatmap of the full coefficient matrix.
sns.clustermap(data=fps_pheno_df)

## Plots for only cross (resource + fp) df

In [None]:
# Cross block only: resources as rows, fingerprints as columns.
# Select with the name lists themselves rather than hard-coded label ranges:
# `names` follows directory-listing order, so a 'first':'last' row slice
# depends on that order, while the plots below label the axes with all of
# `names` and `fps`.
fps_pheno_cross_df = fps_pheno_df.loc[names, fps]

In [None]:
# Draw the resource-vs-fingerprint block as a colour-coded grid and save it.
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(fps_pheno_cross_df)
fig.colorbar(cax)

# Columns are fingerprints, rows are resources.
plt.xticks(range(len(fps)), fps, rotation=90)
plt.yticks(range(len(names)), names)

# Save before showing, and through the figure handle: once plt.show() has
# run, the current figure may already be closed, so a later plt.savefig()
# would write an empty image.
fig.savefig('Presentation_figures/resources_fps_globalview.png', bbox_inches='tight')
plt.show()

In [None]:
# Seaborn heatmap of the resource-vs-fingerprint block with the 'vlag' palette.
sns.heatmap(data=fps_pheno_cross_df, cmap='vlag')

In [None]:
# Clustered heatmap of the resource-vs-fingerprint block, written to disk and
# then displayed as the cell result.
clustermap = sns.clustermap(data=fps_pheno_cross_df, cmap='vlag')
clustermap.savefig(
    'Presentation_figures/Resources_fps_globalview_cluster.png'
)
clustermap

## Save DFs

In [None]:
# Persist both coefficient tables as tab-separated files.
fps_pheno_df.to_csv(
    path_or_buf='Output/All_RDKFps_Phenotype.tsv',
    sep='\t',
)
fps_pheno_cross_df.to_csv(
    path_or_buf='Output/All_RDKFps_Phenotype_cross.tsv',
    sep='\t',
)

## Compare similarity of L1000 Data and Fingerprints

In [None]:
# Fingerprint columns, selected as the label range from 'AtomPair' through
# 'TopologicalTorsion'.
fp_columns = slice('AtomPair', 'TopologicalTorsion')
# Both L1000 rows (kept as a DataFrame) ...
fps_L100_only = fps_pheno_df.loc[['L1000_sig_new', 'L1000_signatures'], fp_columns]
# ... and the single 'L1000_sig_new' row.
fps_L1000_only = fps_pheno_df.loc['L1000_sig_new', fp_columns]

In [None]:
# Clustered heatmap of the two L1000 rows against the fingerprints, saved to disk.
clustermap = sns.clustermap(data=fps_L100_only, cmap='vlag')
clustermap.savefig(
    'Presentation_figures/L1000_cluster.png'
)

## Determine individual Pearson coefficients

In [None]:
# Compute a single Pearson coefficient between the 'L1000_sig_new' resource
# and the scaffold table.
# The scaffold load is restored here: `scaf_df` is used below, and with its
# definition commented out the cell raised NameError on a fresh run.
# NOTE(review): confirm 'Output/L1000_Scaffolds_pcid_2018_07.tsv' is the
# intended scaffold file.
scaf_df = pd.read_table('Output/L1000_Scaffolds_pcid_2018_07.tsv')
scaf_df.set_index('Unnamed: 0', inplace=True)
scaf_df = scaf_df.T

db = namefiles_dict['L1000_sig_new'].T
# Alternative inputs tried previously:
#   db = namefiles_dict['BindingDB_Targets'].T
#   fpdb = fpsfiles_dict['RDKfps2']
fpdb = scaf_df
# Restrict both tables to their common row labels, in sorted order, so the
# two condensed distance vectors line up.
shared = sorted(set(db.index) & set(fpdb.index))
db = db.loc[shared]
fpdb = fpdb.loc[shared]
db_vect = vector(pairwise_dis(db, 'manhattan'))
fpdb_vect = vector(pairwise_dis(fpdb, 'manhattan'))
p_coeff = pearsonr(db_vect, fpdb_vect)[0]