# Classification using BFGS -- PyTorch version

This notebook details the implementation of a generic ridge-regularized classifier, solved by direct gradient-based optimization (here a quasi-Newton method, BFGS).
It is implemented in kernel space, i.e. the weights are represented over the space of data points.

In [11]:
%load_ext autoreload
%autoreload 2
import torch
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE: despite its name, device_cpu is an alias of `device`, so it points to
# the GPU whenever CUDA is available. Bind it to torch.device("cpu") instead
# if some tensors must stay on the host.
device_cpu = device
print( device )

# Working precision. torch.float16 is the memory-saving alternative (GPU only).
mytype = torch.float32

ModuleNotFoundError: No module named 'torch'

# Data

## Kprot

In [12]:
import pickle

# Read the pickled protein kernel matrix from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('data/CC_all2_base_K_prot.data', 'rb') as kprot_file:
    K_prot = pickle.load(kprot_file)

In [13]:
# Sanity check: display the dimensions of the loaded protein kernel.
K_prot.shape

(1647, 1647)

## List of the 152,844 SMILES

In [14]:
# same in zip format
import pandas as pd
import zipfile

# Open the archive and its CSV member through context managers so that both
# handles are closed as soon as the frame has been read into memory.
with zipfile.ZipFile('data/Consensus_CompoundBioactivity_Dataset_v1.1_Sh2_all2.csv.zip') as zf:
    with zf.open('Consensus_CompoundBioactivity_Dataset_v1.1_Sh2_all2.csv') as csv_file:
        df = pd.read_csv(csv_file, low_memory=False)
# Keep only the rows flagged as positive interactions.
df_p = df[df['interaction+'] == True]
#list of smiles strings
smiles = df_p['standardized smiles'].drop_duplicates().values
len(smiles)

152844

In [15]:
from rdkit import Chem
from rdkit.Chem import AllChem

import numpy as np

nM = 10000 # len(smiles)
n_bits = 1024  # length of each Morgan fingerprint bit vector
# One row per molecule, one column per fingerprint bit.
MorganFP = np.zeros((nM, n_bits))
for i in range(nM):
    # Convert SMILES to RDKit molecule object
    mol = Chem.MolFromSmiles(smiles[i])
    if mol is None:
        # MolFromSmiles signals an unparsable string by returning None; stop
        # here with the offending entry instead of failing inside the
        # fingerprint call with an opaque argument error.
        raise ValueError(f"Could not parse SMILES at index {i}: {smiles[i]!r}")
    # Generate Morgan fingerprint of the molecule (radius 2)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
    # Convert the fingerprint to a numpy array; per the RDKit documentation
    # ConvertToNumpyArray resizes the destination array to the fingerprint length.
    arr = np.zeros((1,))
    AllChem.DataStructs.ConvertToNumpyArray(fp, arr)
    MorganFP[i,:] = arr
# Integer 0/1 matrix, as used by the kernel computation on MorganFP.
MorganFP = MorganFP.astype(int)

In [16]:
import Nystrom_method
from  Nystrom_method import nystroem,KronKernel
# random list of molecules 
kM = 3000 # number of molecule to compute nystrom
rM = 1000 # final dimension of features
# Draw the landmark molecules from a seeded generator so the same subset is
# selected on every run; an unseeded draw makes the downstream kernel and
# features change between executions.
landmark_rng = np.random.default_rng(0)
# Random subset of kM distinct molecule indices out of the nM available.
I = landmark_rng.permutation(nM)[:kM]

In [17]:
# compute Tanimoto kernel 
# For binary fingerprints a, b: |a AND b| / |a OR b|, where the union size is
# the fingerprint length minus the number of bits that are 0 in both vectors.
# The length is read from the data so it cannot drift from the fingerprint size.
# NOTE: a pair of all-zero fingerprints has an empty union and gives 0/0 (nan).
n_bits = MorganFP.shape[1]
Km = ( MorganFP[I,:] @ MorganFP.T ) / ( n_bits - (1-MorganFP[I,:]) @ (1-MorganFP.T) )

In [18]:
# Nystrom feature extraction. The first argument places Km[:, I] (the columns
# of Km at the sampled indices I) in front of the full Km, joined along
# axis 1; rM is passed as the second argument.
# NOTE(review): the meaning of the returned Xm, Lambda and LambdaC is defined in
# Nystrom_method.nystroem, which is not visible here — confirm there.
Xm,Lambda,LambdaC = nystroem(np.concatenate((Km[:,I], Km), axis=1),rM)

## Protein/molecule index lists (positive interactions only)

In [5]:
# Index columns of the positive-interaction frame df_p, as plain arrays.
# NOTE(review): `I` is also the name the Nystrom cells use for the sampled
# molecule subset; this assignment rebinds it.
J = df_p['indfasta'].values   # protein indices
I = df_p['indsmiles'].values  # molecules indices
# Report both lengths, one per line (proteins first).
print(len(J), len(I), sep='\n')

231964
231964


## Train/test sets with protein/molecule indices and balanced interactions

In [8]:
# Restore the pickled train and test arrays from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
import pickle

with open('data/train.data', 'rb') as train_file:
    train = pickle.load(train_file)

with open('data/test.data', 'rb') as test_file:
    test = pickle.load(test_file)

# Number of entries in the training set.
print(len(train))

371142


In [9]:
# Columns: protein index first, then ligand index, then the interaction label.
train

array([[   145, 124184,      1],
       [   145, 125039,      1],
       [   145,  48320,      1],
       ...,
       [  1643, 150939,      0],
       [  1647, 152928,      1],
       [  1647, 126691,      0]])

In [20]:
# Build the train/test interaction arrays from the full frame df.
# NOTE(review): the captured output shows five train/test pairs, so the `5` is
# presumably the number of splits; the meaning of the trailing `1` is not
# visible here — confirm both against utils.make_train_test.
from utils import make_train_test
all_train_interactions_arr, all_test_interactions_arr = make_train_test(df,5,1)

train (366418, 3)
test (91606, 3)
train (366418, 3)
test (91606, 3)
train (366420, 3)
test (91604, 3)
train (366420, 3)
test (91604, 3)
train (366420, 3)
test (91604, 3)
Train/test datasets prepared.


In [21]:
# Persist the generated train and test interaction arrays with pickle.
with open('data/CC_train_arr.data', 'wb') as train_out:
    pickle.dump(all_train_interactions_arr, train_out)

with open('data/CC_test_arr.data', 'wb') as test_out:
    pickle.dump(all_test_interactions_arr, test_out)