In [5]:
!pip install rdkit-pypi
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, AllChem, PandasTools, Descriptors
from rdkit.Chem.Draw import IPythonConsole
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt




In [8]:
df2 = pd.read_csv('chembl_drugs.txt.gz', sep='\t')

In [9]:
df2 = df2[df2["CANONICAL_SMILES"].notnull()]
df2 = df2[df2["USAN_STEM"].notnull()]
df2 = df2[df2["RULE_OF_FIVE"]== 'Y']
df2 = df2[df2["DEVELOPMENT_PHASE"] == 4]
len(df2)

700

In [10]:
PandasTools.AddMoleculeColumnToFrame(df2, smilesCol='CANONICAL_SMILES')
df3 = df2.sample(200)
df3.head()

Unnamed: 0,PARENT_MOLREGNO,CHEMBL_ID,SYNONYMS,DEVELOPMENT_PHASE,RESEARCH_CODES,APPLICANTS,USAN_STEM,USAN_STEM_DEFINITION,USAN_STEM_SUBSTEM,USAN_YEAR,...,ORAL,PARENTERAL,TOPICAL,BLACK_BOX,AVAILABILITY_TYPE,WITHDRAWN_YEAR,WITHDRAWN_COUNTRY,WITHDRAWN_REASON,CANONICAL_SMILES,ROMol
11010,1078456,CHEMBL1697686,"Cloforex (INN, MI)",4,D-237,,-orex,anorexiants,-orex,,...,N,N,N,N,Withdrawn,1969.0,Germany,Cardiovascular,CCOC(=O)NC(C)(C)Cc1ccc(Cl)cc1,<rdkit.Chem.rdchem.Mol object at 0x000001F7C30...
513,258709,CHEMBL1305,"Antazoline (BAN, INN); Antazoline HCl (MI, USP...",4,,Novartis Pharmaceuticals Corp,-azoline,antihistamines/local vasoconstrictors (antazol...,-azoline,,...,N,N,Y,N,Discontinued,,,,C(N(Cc1ccccc1)c2ccccc2)C3=NCCN3,<rdkit.Chem.rdchem.Mol object at 0x000001F7C28...
11265,155045,CHEMBL1113,"Amoxapine (BAN, FDA, INN, JAN, USAN, USP)",4,CL-67772,Lederle Laboratories Div American Cyanamid Co,-pin(e),tricyclic compounds,-pin(e),1971.0,...,Y,N,N,Y,Prescription-only,,,,Clc1ccc2Oc3ccccc3N=C(N4CCNCC4)c2c1,<rdkit.Chem.rdchem.Mol object at 0x000001F7C30...
1345,394231,CHEMBL1466,"Dicoumarol (DCF, INN); Dicumarol (FDA, USAN, USP)",4,,Eli Lilly And Co; Abbvie Inc,-arol,anticoagulants (dicumarol type),-arol,1971.0,...,Y,N,N,N,Discontinued,,,,OC1=C(CC2=C(O)c3ccccc3OC2=O)C(=O)Oc4ccccc14,<rdkit.Chem.rdchem.Mol object at 0x000001F7C30...
11328,75901,CHEMBL888,"Gemcitabine (BAN, INN, USAN); Gemcitabine HCl ...",4,LY-188011,Lilly; Hospira Inc; Eli Lilly And Co,-citabine,"nucleoside antiviral or antineoplastic agents,...",-citabine,1989.0,...,N,Y,N,N,Prescription-only,,,,NC1=NC(=O)N(C=C1)[C@@H]2O[C@H](CO)[C@@H](O)C2(F)F,<rdkit.Chem.rdchem.Mol object at 0x000001F7C30...


In [11]:
common_stems = df2.groupby('USAN_STEM').size().sort_values()[-10:]
common_stems

USAN_STEM
-tinib     12
-olone     12
-pin(e)    12
-ium       14
-vir       15
-caine     15
-oxacin    17
-cillin    17
-olol      18
sulfa-     18
dtype: int64

In [17]:
df_small = df2[df2['USAN_STEM'].isin(list(common_stems.index))].copy()

In [18]:
def get_cfp(mol):
    arr = np.zeros((1,), dtype=float)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr


In [19]:
df_small['FP'] = df_small['ROMol'].map(get_cfp)
df_small.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 0 to 11378
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PARENT_MOLREGNO       150 non-null    int64  
 1   CHEMBL_ID             150 non-null    object 
 2   SYNONYMS              150 non-null    object 
 3   DEVELOPMENT_PHASE     150 non-null    int64  
 4   RESEARCH_CODES        103 non-null    object 
 5   APPLICANTS            142 non-null    object 
 6   USAN_STEM             150 non-null    object 
 7   USAN_STEM_DEFINITION  150 non-null    object 
 8   USAN_STEM_SUBSTEM     150 non-null    object 
 9   USAN_YEAR             108 non-null    float64
 10  FIRST_APPROVAL        138 non-null    float64
 11  ATC_CODE              133 non-null    object 
 12  ATC_CODE_DESCRIPTION  133 non-null    object 
 13  INDICATION_CLASS      104 non-null    object 
 14  SC_PATENT_NO          53 non-null     object 
 15  DRUG_TYPE            

In [22]:
print(df_small.iloc[43,30].nonzero()) #Показали фингерпринт одного случайного лекарства

(array([  33,   36,   37,   47,   71,  105,  130,  138,  231,  250,  301,
        312,  314,  356,  395,  412,  432,  437,  453,  507,  516,  519,
        522,  528,  529,  555,  594,  622,  627,  650,  652,  674,  694,
        751,  759,  803,  807,  922,  926,  940,  956, 1019], dtype=int64),)


In [36]:
X = np.array([x for x in df_small['FP']])
print(X.shape)
print(list(common_stems.index))

(150, 1024)
['-tinib', '-olone', '-pin(e)', '-ium', '-vir', '-caine', '-oxacin', '-cillin', '-olol', 'sulfa-']


In [43]:
def get_y(s):
    return (list(common_stems.index)).index(s)
Y = np.zeros(len(X))
for i  in range(len(X)):
    Y[i] = get_y(df_small.iloc[i,6])
print(Y)
Y = Y.reshape(-1,1)
Y.shape

[1. 6. 2. 0. 5. 9. 4. 3. 6. 1. 6. 0. 7. 1. 7. 4. 9. 8. 4. 9. 4. 4. 8. 9.
 6. 6. 5. 2. 2. 8. 3. 6. 6. 0. 9. 7. 9. 8. 1. 8. 7. 7. 8. 1. 5. 0. 7. 8.
 4. 6. 8. 0. 2. 9. 0. 8. 5. 5. 9. 3. 2. 9. 9. 3. 9. 1. 9. 1. 4. 9. 6. 5.
 9. 1. 6. 5. 0. 1. 4. 0. 3. 6. 3. 8. 8. 0. 4. 4. 8. 5. 7. 3. 6. 2. 5. 4.
 7. 7. 6. 7. 7. 2. 5. 8. 4. 7. 3. 4. 6. 6. 8. 3. 2. 7. 8. 8. 5. 1. 0. 3.
 5. 1. 6. 9. 8. 3. 3. 7. 2. 7. 7. 4. 0. 7. 9. 2. 8. 6. 5. 0. 2. 9. 1. 5.
 3. 2. 3. 4. 9. 5.]


(150, 1)

In [71]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, random_state=1239, train_size=0.8, stratify=Y)

In [87]:
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#scaler.fit(X_train)
#X_train_S = scaler.transform(X_train)
#X_test_S = scaler.transform(X_test)
clf4 = SVC()
clf4.fit(X_train, Y_train)
Y_pred = clf4.predict(X_test)
metrics.accuracy_score(Y_test, Y_pred)
print(Y_pred)

[8. 7. 5. 4. 2. 8. 9. 1. 3. 9. 0. 8. 8. 6. 9. 2. 7. 4. 9. 5. 1. 3. 6. 6.
 0. 0. 5. 7. 6. 3.]


  y = column_or_1d(y, warn=True)


In [73]:
from sklearn.linear_model import LogisticRegression

clf6 = LogisticRegression()
clf6.fit(X_train, Y_train)
Y_pred = clf6.predict(X_test)
metrics.accuracy_score(Y_test, Y_pred)

  y = column_or_1d(y, warn=True)


0.9666666666666667