# Chemical Space


Chemical space refers to an arrangement of compounds in an n-dimensional space according to some measure of distance. 
In practice, two or three dimensions are usually used so that humans can interpret the result. Although various 
measures (i.e. similarities) have been proposed as the scale, one usually chooses a distance that characterizes the compounds of interest well.


This time, we will visualize which pharmaceutical companies are developing what kinds of compounds as antagonists 
of the orexin receptor, which is known as a target for sleep medicine. 

In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Draw
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
import os

In [3]:
# (ChEMBL identifier, company) pairs for the orexin receptor antagonist data.
# The identifier doubles as the stem of the SDF file loaded for each entry.
oxrs = [
    ("CHEMBL3098111", "Merck"),
    ("CHEMBL3867477", "Merck"),
    ("CHEMBL2380240", "Rottapharm"),
    ("CHEMBL3352684", "Merck"),
    ("CHEMBL3769367", "Merck"),
    ("CHEMBL3526050", "Actelion"),
    ("CHEMBL3112474", "Actelion"),
    ("CHEMBL3739366", "Heptares"),
    ("CHEMBL3739395", "Actelion"),
    ("CHEMBL3351489", "Eisai"),
]

In [6]:
# Build one Morgan-fingerprint row per readable molecule, together with the
# identifier and company it came from. The three sequences are appended to in
# lockstep, so index i of each refers to the same molecule.

# Directory holding one "<identifier>.sdf" file per entry in `oxrs`.
# NOTE(review): machine-specific absolute path; adjust for your environment.
sdf_dir = '/home/oohnohnoh1/Desktop/GIT/Chemiinformatics_work/py4chemoinformatics/notebooks/ch08/'

fps = []
docs = []
companies = []

for cid, company in oxrs:
    # Join directory and file name as separate components instead of passing
    # a single pre-concatenated string to os.path.join.
    sdf_file = os.path.join(sdf_dir, cid + '.sdf')
    mols = Chem.SDMolSupplier(sdf_file)
    for mol in mols:
        # The supplier yields None for records it cannot parse; skip those.
        if mol is None:
            continue
        # Radius-2 Morgan fingerprint; nBits is spelled out so the row width
        # of the resulting matrix is visible here (2048 is also the default).
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        # Placeholder array that ConvertToNumpyArray fills in place with the
        # fingerprint bits.
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        docs.append(cid)
        companies.append(company)
        fps.append(arr)

# Convert the accumulated lists to arrays for the downstream analysis.
fps = np.array(fps)
companies = np.array(companies)
docs = np.array(docs)
            

In [7]:
fps.shape

(293, 2048)

In [8]:
# PCA model that keeps only the first two principal components, so each
# fingerprint can be plotted as a point in 2-D.
# NOTE(review): random_state is not set; if the solver selected by the default
# svd_solver is a randomized one, coordinates may vary between runs — confirm.
pca = PCA(n_components = 2)

In [9]:
pca

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [10]:
fps

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
# Fit the PCA on the fingerprint matrix and project it in one step; `x` holds
# one row of two component scores per fingerprint row in `fps`.
x = pca.fit_transform(fps)

In [12]:
x

array([[-0.66925392,  3.22891897],
       [ 0.32815732,  2.63548819],
       [ 0.32968343,  2.13661915],
       [-0.40011815,  2.87836786],
       [ 0.20176745,  3.00920926],
       [ 0.16333224,  2.92680382],
       [-0.4365177 ,  2.90944492],
       [ 0.23035557,  2.98608053],
       [-0.65095276,  2.70337661],
       [ 0.13639894,  3.03869324],
       [-0.73150267,  2.800221  ],
       [-0.63870741,  3.1754935 ],
       [-0.83701326,  2.74171214],
       [-0.76519465,  2.87643774],
       [-0.68836306,  2.71764548],
       [-0.76575336,  2.87912422],
       [ 0.25370065,  2.16274318],
       [ 3.65161608, -2.38573211],
       [ 3.30436589, -2.66009026],
       [ 3.48734348, -2.58271908],
       [ 4.14694844, -3.14551864],
       [ 0.65295009,  0.74299883],
       [ 3.4629036 , -2.3409921 ],
       [ 3.65327467, -2.64705712],
       [ 3.49616659, -2.19120086],
       [ 2.84254786, -2.76565337],
       [ 3.39671311, -2.11051921],
       [ 3.38709525, -2.29144804],
       [ 3.34008333,