## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import time
from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
from scipy import stats, integrate

In [None]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import PandasTools
from rdkit import RDConfig
from rdkit.Chem import AllChem
from rdkit import Avalon
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem.Fingerprints import ClusterMols
from rdkit.Chem.Fingerprints import FingerprintMols

In [None]:
import pybel, openbabel

## Load Data

In [None]:
# Load the tab-separated compound table into a DataFrame.
df = pd.read_csv('Output/PubChemID_SMILES_InchI_2018_07.tsv', sep='\t')

In [None]:
# The InChIKeys column is not used below, so remove it.
df = df.drop('InChIKeys', axis='columns')

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Keep only the first row seen for each PCIDs value.
df = df.drop_duplicates(subset=['PCIDs'], keep='first')

In [None]:
# (rows, columns) after removing duplicated PCIDs.
df.shape

## Get available fingerprints and add a molecule column

In [None]:
# Display the fingerprint type names that pybel exposes; the cells below
# iterate over this list and use its entries as `fptype` values.
pybel.fps

In [None]:
# Parse each SMILES string into a pybel molecule, preserving row order.
smiles_column = df['SMILEs']
molecules = [pybel.readstring('smi', smiles) for smiles in smiles_column]

In [None]:
# Attach the parsed molecules as a new column, aligned on the frame's index.
df['Molecule'] = pd.Series(molecules, index=df.index)

In [None]:
# Preview the table with the added Molecule column.
df.head()

In [None]:
# (rows, columns) including the Molecule column.
df.shape

## Fingerprint calculations

In [None]:
# One list of fingerprints per fingerprint type, each in the row order of df.
# The generator is consumed by the tuple unpacking, so the lists are built in
# the order the names appear on the left-hand side.
(ecfp0, ecfp10, ecfp2, ecfp4, ecfp6, ecfp8, fp2, fp3, fp4, maccs) = (
    [mol.calcfp(fptype=fp_name) for mol in df['Molecule']]
    for fp_name in ('ecfp0', 'ecfp10', 'ecfp2', 'ecfp4', 'ecfp6', 'ecfp8',
                    'fp2', 'fp3', 'fp4', 'maccs')
)

In [None]:
# For every ordered pair of molecules (outer index first, inner index second)
# store the value of `fingerprint | fingerprint`, which pybel defines as the
# Tanimoto coefficient. Each list therefore has len(molecules) ** 2 entries,
# laid out row by row.
ecfp0_list = [first | second for first in ecfp0 for second in ecfp0]
ecfp10_list = [first | second for first in ecfp10 for second in ecfp10]
ecfp2_list = [first | second for first in ecfp2 for second in ecfp2]
ecfp4_list = [first | second for first in ecfp4 for second in ecfp4]
ecfp6_list = [first | second for first in ecfp6 for second in ecfp6]
ecfp8_list = [first | second for first in ecfp8 for second in ecfp8]
fp2_list = [first | second for first in fp2 for second in fp2]
fp3_list = [first | second for first in fp3 for second in fp3]
fp4_list = [first | second for first in fp4 for second in fp4]
maccs_list = [first | second for first in maccs for second in maccs]

In [None]:
# Convert each flat similarity list to a NumPy array (needed by pearsonr).
(ecfp0_array, ecfp10_array, ecfp2_array, ecfp4_array, ecfp6_array,
 ecfp8_array, fp2_array, fp3_array, fp4_array, maccs_array) = map(
    np.asarray,
    (ecfp0_list, ecfp10_list, ecfp2_list, ecfp4_list, ecfp6_list,
     ecfp8_list, fp2_list, fp3_list, fp4_list, maccs_list),
)

In [None]:
# Order matters: the following cell pairs these arrays positionally with the
# entries of pybel.fps.
arrays_list = [
    ecfp0_array,
    ecfp10_array,
    ecfp2_array,
    ecfp4_array,
    ecfp6_array,
    ecfp8_array,
    fp2_array,
    fp3_array,
    fp4_array,
    maccs_array,
]

In [None]:
# Map each fingerprint type name to its similarity array. The pairing is
# purely positional, so it relies on pybel.fps listing the types in the same
# order as arrays_list.
fps_dict = {
    fp_name: arrays_list[position]
    for position, fp_name in enumerate(pybel.fps)
}

In [None]:
# Pearson correlation between every pair of fingerprint similarity arrays;
# matrix[r][c] compares arrays_list[r] with arrays_list[c].
matrix = [
    [pearsonr(row_array, col_array)[0] for col_array in arrays_list]
    for row_array in arrays_list
]

In [None]:
# Stack the per-row correlation lists into one 2-D array for plotting.
array_matrix = np.array(list(map(np.array, matrix)))

In [None]:
# Heat map of the Pearson correlations between fingerprint types.
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(array_matrix)
fig.colorbar(cax)

# Derive the tick positions from the label list instead of a hard-coded 10,
# so positions and labels always have the same length.
tick_positions = range(len(pybel.fps))
plt.xticks(tick_positions, pybel.fps, rotation = 90)
plt.yticks(tick_positions, pybel.fps)

plt.show()

## Fingerprint incorporated with Phenotypes

In [None]:
## Load Data
# Read every file in All_bmat/ as a tab-separated table indexed by its first
# column. The resource name is the file name minus its last 12 characters.
names, files = [], []

for entry in os.listdir('All_bmat/'):
    names.append(entry[:-12])
    table = pd.read_table('All_bmat/' + entry)
    first_column = table.columns[0]
    files.append(table.set_index(first_column))

# Resource name -> table, in directory-listing order.
namefiles_dict = {name: table for name, table in zip(names, files)}

In [None]:
## Functions that get similarity
def pairwise_dis(df, metric):
    """Return the square matrix of pairwise distances between the rows of ``df``.

    ``metric`` is passed straight to ``sklearn.metrics.pairwise_distances``
    (the callers in this notebook use ``'manhattan'``).
    """
    return metrics.pairwise_distances(df, metric=metric)

def vector(array_matrix):
    """Apply ``scipy.spatial.distance.squareform`` to ``array_matrix``.

    Given a square, symmetric, zero-diagonal distance matrix this yields the
    condensed 1-D vector of its upper triangle; given a condensed vector it
    yields the square matrix.
    """
    converted = squareform(array_matrix)
    return converted

In [None]:
# Define the DF
# Square table of correlations whose rows and columns are the resource names
# followed by the fingerprint types. It is filled with *float* zeros: the
# cells later receive Pearson coefficients, and writing floats into
# integer-typed columns forces a dtype upcast that recent pandas versions
# warn about. A value of 0 still means "not computed yet" for later cells.
axis_labels = names + pybel.fps
fps_pheno_df = pd.DataFrame(0.0, index = axis_labels, columns = axis_labels)
fps_pheno_df.head()

In [None]:
## Similarity for resources
# For each ordered pair of resources: restrict both (transposed) tables to the
# labels they share, turn each into a condensed Manhattan-distance vector, and
# store the Pearson correlation of the two vectors.
for row_name in names:
    row_table = namefiles_dict[row_name].T
    for col_name in names:
        col_table = namefiles_dict[col_name].T
        # Sorted so both tables are ordered identically.
        common = sorted(set(row_table.index).intersection(col_table.index))
        row_vect = vector(pairwise_dis(row_table.loc[common], 'manhattan'))
        col_vect = vector(pairwise_dis(col_table.loc[common], 'manhattan'))
        fps_pheno_df.loc[row_name, col_name] = pearsonr(row_vect, col_vect)[0]

In [None]:
## Similarity for fingerprints
# Fill the fingerprint-by-fingerprint cells with the Pearson correlation of
# the corresponding all-pairs similarity arrays.
for row_fp in pybel.fps:
    row_values = fps_dict[row_fp]
    for col_fp in pybel.fps:
        correlation = pearsonr(row_values, fps_dict[col_fp])[0]
        fps_pheno_df.loc[row_fp, col_fp] = correlation

In [None]:
# Cast the PCIDs column to str and make it the index of df (in place).
# The next cell intersects df.index with the labels of the resource tables;
# presumably those labels are strings, hence the cast -- TODO confirm.
df['PCIDs'] = df['PCIDs'].astype(str)
df.set_index("PCIDs", inplace = True)
# Preview the re-indexed table.
df.head()

In [None]:
## Similarity between fingerprints and resources
# For every (resource, fingerprint type) pair, correlate the condensed
# Manhattan-distance vector of the resource with the condensed
# (1 - Tanimoto) distance vector of the fingerprints, both restricted to the
# compounds present in the resource table and in df.
for dbname in names:
    # A cell that is no longer 0 was filled by an earlier run; skip it so
    # re-running this cell does not repeat finished work.
    pending_fps = [fp for fp in pybel.fps if fps_pheno_df.loc[dbname, fp] == 0]
    if not pending_fps:
        continue

    # The phenotype vector depends only on the resource, so build it once
    # per resource rather than once per fingerprint type.
    db = namefiles_dict[dbname].T
    shared = sorted(set(db.index) & set(df.index))
    db_vect = vector(pairwise_dis(db.loc[shared], 'manhattan'))
    shared_molecules = df.loc[shared, 'Molecule']

    for fp in pending_fps:
        # Fingerprints of the shared compounds, in the same order as `shared`.
        calc_fp = [mol.calcfp(fptype = fp) for mol in shared_molecules]
        n_mols = len(calc_fp)

        # Float distance matrix. `a | b` is pybel's Tanimoto coefficient, which
        # is symmetric, so only the upper triangle is computed and mirrored;
        # the diagonal (self-distance) stays 0.
        dist = np.zeros((n_mols, n_mols))
        for i in range(n_mols):
            for e in range(i + 1, n_mols):
                dist[i, e] = dist[e, i] = 1 - (calc_fp[i] | calc_fp[e])
        fp_vect = vector(dist)

        # Find the Pearson coefficient and store it in both mirrored cells.
        p_coeff = pearsonr(db_vect, fp_vect)[0]
        fps_pheno_df.loc[dbname, fp] = p_coeff
        fps_pheno_df.loc[fp, dbname] = p_coeff

In [None]:
# Cross block: resources as rows, fingerprint types as columns.
# Select by the label lists themselves rather than by a hard-coded label
# slice: the row order comes from os.listdir (arbitrary) and the column names
# from pybel.fps, so a fixed 'first':'last' slice can silently drop rows or
# columns. The cross-block plot below labels its axes with `names` and
# `pybel.fps`, so the frame must contain exactly those.
fps_pheno_cross_df = fps_pheno_df.loc[names, pybel.fps]

## Plots for full df

In [None]:
# Heat map of the full correlation table (resources followed by fingerprints).
full_labels = names + pybel.fps
full_ticks = range(len(full_labels))

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
cax = ax.matshow(fps_pheno_df)
fig.colorbar(cax)

plt.xticks(full_ticks, full_labels, rotation=90)
plt.yticks(full_ticks, full_labels)
plt.show()

In [None]:
# The full correlation table rendered as a seaborn heat map.
sns.heatmap(fps_pheno_df)

In [None]:
# Hierarchically clustered heat map of the full correlation table.
sns.clustermap(fps_pheno_df)

## Plot for only cross df

In [None]:
# Heat map of the resource-by-fingerprint block only.
fp_ticks = range(len(pybel.fps))
resource_ticks = range(len(names))

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
cax = ax.matshow(fps_pheno_cross_df)
fig.colorbar(cax)

plt.xticks(fp_ticks, pybel.fps, rotation=90)
plt.yticks(resource_ticks, names)
plt.show()

In [None]:
# The resource-by-fingerprint block rendered as a seaborn heat map.
sns.heatmap(fps_pheno_cross_df)

In [None]:
# Hierarchically clustered heat map of the resource-by-fingerprint block.
sns.clustermap(fps_pheno_cross_df)