# Classification using BFGS -- PyTorch version

This notebook details the implementation of a generic ridge-regularized classifier trained by direct gradient-based optimization (here a quasi-Newton method, BFGS).
It is implemented in kernel space, i.e. the weights are represented over the space of data points.

In [11]:
%load_ext autoreload
%autoreload 2
import torch
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm

# Compute device: first CUDA GPU when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE(review): device_cpu is an alias of `device`, so tensors sent to
# "device_cpu" live on the GPU whenever one is available. Use
# torch.device("cpu") here instead if host-side storage is wanted -- confirm.
device_cpu = device
print( device )

# Floating-point precision used for tensors. torch.float16 would save memory
# (only on GPU); float32 is the setting actually in effect.
mytype = torch.float32

ModuleNotFoundError: No module named 'torch'

# Data

In [2]:
%load_ext autoreload
%autoreload 2

import utils
from utils import load_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Load the interaction table through the project helper, print its
# (rows, columns) shape and preview the first rows.
df_DB = load_data()
print(df_DB.shape)
df_DB.head()

(13717, 7)


Unnamed: 0,uniprot,DBid,smiles,ind2mol,fasta,ind2prot,inter
0,A0A024R8I1,DB00381,CCOC(=O)C1=C(COCCN)NC(C)=C(C1C1=CC=CC=C1Cl)C(=...,213,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1
1,A0A024R8I1,DB00996,NCC1(CC(O)=O)CCCCC1,686,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1
2,A1L3X4,DB12965,[Ag],4672,MDLSCSCATGGSCTCASSCKCKEYKCTSCKKNCCSCCPMGCAKCAQGCT,1,1
3,A5X5Y0,DB00715,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,462,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1
4,A5X5Y0,DB09304,CN1CCC2=C(C1)C1=CC=CC=C1CC1=CC=CC=C21,4467,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1


In [23]:
# SMILES of the entry removed from the table.
smiles_to_drop = "NC1=C(C2=C(N)N=C(N)N=C2C=C1)[Cl](=O)=O"
# Drop the row with index label 12222 (presumably the row carrying the SMILES
# above -- verify against the data). errors="ignore" keeps the cell safe to
# re-run: once the label is gone, a second execution is a no-op instead of
# raising KeyError.
df_DB = df_DB.drop(12222, errors="ignore")
# Sanity check: display the rows that still carry this SMILES (expected: none).
df_DB[df_DB["smiles"] == smiles_to_drop]


Unnamed: 0,uniprot,DBid,smiles,ind2mol,fasta,ind2prot,inter


In [24]:
# Show the (rows, columns) shape of df_DB at this point.
df_DB.shape

(13716, 7)

In [25]:
# Write the table to CSV; index=False leaves the DataFrame index out of the file.
# NOTE(review): a later cell reads 'data/drugbank.csv.zip', which no visible cell
# creates -- presumably this CSV is zipped by hand; confirm.
df_DB.to_csv('data/drugbank.csv', index=False)

## Kprot

In [26]:
import pickle
# Load the pickled K_prot object from disk.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('data/drugbank_K_prot.data', mode='rb') as kernel_file:
    K_prot = pickle.load(kernel_file)

In [27]:
# Show the dimensions of the loaded K_prot array.
K_prot.shape

(2513, 2513)

## List of the 4813 unique SMILES

In [28]:
# same in zip format
import pandas as pd
import zipfile
# Read the interaction table from the zipped CSV. Both the archive and the
# member stream are opened as context managers so they are closed as soon as
# the frame has been parsed.
with zipfile.ZipFile('data/drugbank.csv.zip') as zf, zf.open('drugbank.csv') as csv_file:
    df = pd.read_csv(csv_file, low_memory=False)
# Keep only the rows flagged as interactions.
df_p = df[df['inter'] == True]
# Unique SMILES strings, in order of first appearance.
smiles = df_p['smiles'].drop_duplicates().values
len(smiles)

4813

In [29]:
from rdkit import Chem
from rdkit.Chem import AllChem

import numpy as np

# Build the fingerprint matrix: one row per SMILES string, 1024 columns.
nM = len(smiles)
MorganFP = np.zeros((nM, 1024))
for i, smi in enumerate(smiles):
    # Parse the SMILES string into an RDKit molecule.
    mol = Chem.MolFromSmiles(smi)
    # Morgan fingerprint (radius 2) folded into a 1024-bit vector.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    # Placeholder array handed to RDKit, which is expected to fill it with the
    # fingerprint bits before it is copied into row i.
    arr = np.zeros((1,))
    AllChem.DataStructs.ConvertToNumpyArray(fp, arr)
    MorganFP[i, :] = arr
# Store the fingerprint entries as integers.
MorganFP = MorganFP.astype(int)

[22:22:02] Unusual charge on atom 0 number of radical electrons set to zero


In [30]:
import Nystrom_method
from  Nystrom_method import nystroem,KronKernel
# Random list of molecules used as landmarks.
# NOTE(review): kM is larger than the 4813 unique SMILES reported above, so the
# slice below keeps every molecule -- confirm the intended value.
kM = 4814 # number of molecules used to compute Nystrom
rM = 1000 # final dimension of features
# Seeded generator so that the landmark order (and everything derived from it)
# is identical on every Restart & Run All.
landmark_seed = 0
rng = np.random.default_rng(landmark_seed)
I = rng.permutation(nM)[:kM]

In [31]:
# Compute the Tanimoto kernel between the selected molecules (rows I) and all
# molecules: |A and B| / |A or B| on the binary fingerprints.
n_bits = MorganFP.shape[1]   # fingerprint width, read from the data rather than hard-coded
fp_selected = MorganFP[I, :]
# Numerator: number of bits set in both fingerprints.
# Denominator: n_bits minus the number of bits unset in both, i.e. the number
# of bits set in at least one of the two fingerprints.
Km = ( fp_selected @ MorganFP.T ) / ( n_bits - (1 - fp_selected) @ (1 - MorganFP.T) )

In [32]:
# Call the project's nystroem helper on the columns of Km selected by I, placed
# side by side (axis=1) with the full Km, and with rM as the second argument.
# NOTE(review): the meaning of the three returned values is defined in
# Nystrom_method and is not visible here -- confirm before relying on them.
Xm,Lambda,LambdaC = nystroem(np.concatenate((Km[:,I], Km), axis=1),rM)

## Protein/molecule index lists (positive interactions only)

In [34]:
# Per-row index arrays taken from df_p: J holds the 'ind2prot' column
# (protein indices), I the 'ind2mol' column (molecule indices).
# Note that I is rebound here; it no longer holds the random permutation.
J, I = (df_p[column].values for column in ('ind2prot', 'ind2mol'))
print(len(J))
print(len(I))

13716
13716


## Train/test splits with protein/molecule indices and balanced interactions

Each row holds the protein index first, then the ligand index, then the interaction label.

In [39]:
# Build a renamed copy of df: 'ind2prot' -> 'indfasta', 'ind2mol' -> 'indsmiles',
# 'inter' -> 'score'. rename returns a new frame; df keeps its column names.
column_renames = dict(ind2prot='indfasta', ind2mol='indsmiles', inter='score')
df_name = df.rename(columns=column_renames)

In [40]:
from utils import make_train_test

# Build the train and test interaction arrays with the project helper.
# NOTE(review): the positional arguments 5 and 1 are not documented here --
# presumably the number of folds and a second setting (seed or ratio); confirm
# against utils.make_train_test.
all_train_interactions_arr, all_test_interactions_arr = make_train_test(df_name,5,1)

train (21944, 3)
test (5488, 3)
train (21946, 3)
test (5486, 3)
train (21946, 3)
test (5486, 3)
train (21946, 3)
test (5486, 3)
train (21946, 3)
test (5486, 3)
Train/test datasets prepared.
