# Classification using BFGS -- Pytorch version

This notebook details the implementation of a generic ridge-regularized classification solved by direct gradient-based optimization (here quasi-newton). 
It is implemented in the kernel space, i.e. representing the weights over the space of points.

In [2]:
%load_ext autoreload
%autoreload 2
import torch
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device_cpu = torch.device("cpu")
device_cpu = device
print( device )

mytype = torch.float16 # to save memory (only on GPU)
mytype = torch.float32

cpu


# Data

In [2]:
%load_ext autoreload
%autoreload 2

import utils
from utils import load_data

In [3]:
df_DB = load_data()
print(df_DB.shape)
df_DB.head()

(13717, 7)


Unnamed: 0,uniprot,DBid,smiles,ind2mol,fasta,ind2prot,inter
0,A0A024R8I1,DB00381,CCOC(=O)C1=C(COCCN)NC(C)=C(C1C1=CC=CC=C1Cl)C(=...,213,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1
1,A0A024R8I1,DB00996,NCC1(CC(O)=O)CCCCC1,686,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1
2,A1L3X4,DB12965,[Ag],4672,MDLSCSCATGGSCTCASSCKCKEYKCTSCKKNCCSCCPMGCAKCAQGCT,1,1
3,A5X5Y0,DB00715,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,462,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1
4,A5X5Y0,DB09304,CN1CCC2=C(C1)C1=CC=CC=C1CC1=CC=CC=C21,4467,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1


In [8]:
print(df_DB[df_DB["smiles"] =="NC1=C(C2=C(N)N=C(N)N=C2C=C1)[Cl](=O)=O"])
# drop the molecule with the error in rdkit
df_DB = df_DB.drop(12222)
df_DB[df_DB["smiles"] =="NC1=C(C2=C(N)N=C(N)N=C2C=C1)[Cl](=O)=O"]


      uniprot     DBid                                  smiles  ind2mol  \
12222  Q86XF0  DB01929  NC1=C(C2=C(N)N=C(N)N=C2C=C1)[Cl](=O)=O     1169   

                                                   fasta  ind2prot  inter  
12222  MFLLLNCIVAVSQNMGIGKNGDLPRPPLRNEFRYFQRMTTTSSVEG...      1989      1  


Unnamed: 0,uniprot,DBid,smiles,ind2mol,fasta,ind2prot,inter


In [6]:
df_DB.shape

(13716, 7)

In [20]:
# il faut renuméroter in2mol car il manque le numéro 1169
np.sort(df_DB["ind2mol"].unique()) == np.arange(0,df_DB["ind2mol"].unique().shape[0])

array([ True,  True,  True, ..., False, False, False])

In [22]:
# make dict smiles2ind and dict ind2smiles
df_sm = df_DB[["smiles"]].drop_duplicates().reset_index()
#df_sm = df_DB[["standardized smiles"]].drop_duplicates().reset_index()
df_sm.drop(columns=["index"],inplace=True)
dict_ind2smiles = df_sm.to_dict()["smiles"]
#dict_ind2smiles = df_sm.to_dict()["standardized smiles"]
print("nombre de smiles: ",len(dict_ind2smiles))
dict_smiles2ind = {v: k for k, v in dict_ind2smiles.items()}

nombre de smiles:  4813


In [25]:
# add this number to df_DB
df_DB["indsmiles"] = df_DB["smiles"].map(dict_smiles2ind)
# we drop when indsmiles is Nan
indsmiles_index_with_nan = df_DB.index[df_DB.loc[:,"indsmiles"].isnull()]
df_DB = df_DB.drop(indsmiles_index_with_nan,0)
df_DB["indsmiles"] = df_DB["indsmiles"].astype(int)
df_DB.head()

Unnamed: 0,uniprot,DBid,smiles,ind2mol,fasta,ind2prot,inter,indsmiles
0,A0A024R8I1,DB00381,CCOC(=O)C1=C(COCCN)NC(C)=C(C1C1=CC=CC=C1Cl)C(=...,213,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1,0
1,A0A024R8I1,DB00996,NCC1(CC(O)=O)CCCCC1,686,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1,1
2,A1L3X4,DB12965,[Ag],4672,MDLSCSCATGGSCTCASSCKCKEYKCTSCKKNCCSCCPMGCAKCAQGCT,1,1,2
3,A5X5Y0,DB00715,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,462,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1,3
4,A5X5Y0,DB09304,CN1CCC2=C(C1)C1=CC=CC=C1CC1=CC=CC=C21,4467,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1,4


In [21]:
# pas de pb pour les protéines
np.sort(df_DB["ind2prot"].unique()) == np.arange(0,df_DB["ind2prot"].unique().shape[0])

array([ True,  True,  True, ...,  True,  True,  True])

In [26]:
# on enleve la colonne ind2mol 
df_DB.drop(columns=["ind2mol"],inplace=True)
df_DB.head()

Unnamed: 0,uniprot,DBid,smiles,fasta,ind2prot,inter,indsmiles
0,A0A024R8I1,DB00381,CCOC(=O)C1=C(COCCN)NC(C)=C(C1C1=CC=CC=C1Cl)C(=...,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1,0
1,A0A024R8I1,DB00996,NCC1(CC(O)=O)CCCCC1,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1,1
2,A1L3X4,DB12965,[Ag],MDLSCSCATGGSCTCASSCKCKEYKCTSCKKNCCSCCPMGCAKCAQGCT,1,1,2
3,A5X5Y0,DB00715,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1,3
4,A5X5Y0,DB09304,CN1CCC2=C(C1)C1=CC=CC=C1CC1=CC=CC=C21,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1,4


In [27]:
# on renomme ind2prot en indfasta
df_DB.rename(columns={"ind2prot":"indfasta"},inplace=True)
df_DB.head()


Unnamed: 0,uniprot,DBid,smiles,fasta,indfasta,inter,indsmiles
0,A0A024R8I1,DB00381,CCOC(=O)C1=C(COCCN)NC(C)=C(C1C1=CC=CC=C1Cl)C(=...,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1,0
1,A0A024R8I1,DB00996,NCC1(CC(O)=O)CCCCC1,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1,1
2,A1L3X4,DB12965,[Ag],MDLSCSCATGGSCTCASSCKCKEYKCTSCKKNCCSCCPMGCAKCAQGCT,1,1,2
3,A5X5Y0,DB00715,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1,3
4,A5X5Y0,DB09304,CN1CCC2=C(C1)C1=CC=CC=C1CC1=CC=CC=C21,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1,4


In [28]:
df_DB.to_csv('data/drugbank.csv', index=False)

## Kprot

In [26]:
import pickle
with open('data/drugbank_K_prot.data', 'rb') as f:
        K_prot = pickle.load(f)

In [27]:
K_prot.shape

(2513, 2513)

## liste des 4814 smiles

In [28]:
# same in zip format
import pandas as pd
import zipfile
zf = zipfile.ZipFile('data/drugbank.csv.zip') 
df = pd.read_csv(zf.open('drugbank.csv'),low_memory=False)
df_p = df[df['inter'] == True]
#list of smiles strings
smiles = df_p['smiles'].drop_duplicates().values
len(smiles)

4813

In [29]:
from rdkit import Chem
from rdkit.Chem import AllChem

import numpy as np

nM =  len(smiles)
MorganFP = np.zeros((nM,1024))
for i in range(nM):
    # Convert SMILES to RDKit molecule object
    mol = Chem.MolFromSmiles(smiles[i])    
    # Generate Morgan fingerprint of the molecule
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    # Convert the fingerprint to a numpy array
    arr = np.zeros((1,))
    AllChem.DataStructs.ConvertToNumpyArray(fp, arr)
    MorganFP[i,:] = arr
MorganFP = MorganFP.astype(int)

[22:22:02] Unusual charge on atom 0 number of radical electrons set to zero


In [30]:
import Nystrom_method
from  Nystrom_method import nystroem,KronKernel
# random list of molecules 
kM = 4814 # number of molecule to compute nystrom
rM = 1000 # final dimension of features
I = np.random.permutation(nM)
I = I[:kM]

In [31]:
# compute Tanimoto kernel 
Km = ( MorganFP[I,:] @ MorganFP.T ) / ( 1024 - (1-MorganFP[I,:]) @ (1-MorganFP.T) )

In [32]:
Xm,Lambda,LambdaC = nystroem(np.concatenate((Km[:,I], Km), axis=1),rM)

## liste des indices protéines/molécules avec que des 1

In [34]:
# protein indices
J = df_p['ind2prot'].values
print(len(J))
# molecules indices
I = df_p['ind2mol'].values
print(len(I))

13716
13716


## train/test avec indices protéines/molécules et interactions balanced

en premier l'indice de la protéine, puis l'indice du ligand puis l'interaction

In [7]:
# change name of the column 'ind2prot' in 'indfasta' in df
df = df_DB.rename(columns={'ind2prot': 'indfasta', 'ind2mol': 'indsmiles', 'inter': 'score'})
df.head()

Unnamed: 0,uniprot,DBid,smiles,indsmiles,fasta,indfasta,score
0,A0A024R8I1,DB00381,CCOC(=O)C1=C(COCCN)NC(C)=C(C1C1=CC=CC=C1Cl)C(=...,213,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1
1,A0A024R8I1,DB00996,NCC1(CC(O)=O)CCCCC1,686,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1
2,A1L3X4,DB12965,[Ag],4672,MDLSCSCATGGSCTCASSCKCKEYKCTSCKKNCCSCCPMGCAKCAQGCT,1,1
3,A5X5Y0,DB00715,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,462,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1
4,A5X5Y0,DB09304,CN1CCC2=C(C1)C1=CC=CC=C1CC1=CC=CC=C21,4467,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1


In [30]:
from utils import make_train_test

all_train_interactions_arr, all_test_interactions_arr = make_train_test(df_name,5,1)

train (21944, 3)
test (5488, 3)
train (21946, 3)
test (5486, 3)
train (21946, 3)
test (5486, 3)
train (21946, 3)
test (5486, 3)
train (21946, 3)
test (5486, 3)
Train/test datasets prepared.


In [29]:
from utils import make_train_test_mol_orphan

all_train_interactions_arr, all_test_interactions_arr = make_train_test_mol_orphan(df,5)

train (21946, 3)
test (5488, 3)
train (21946, 3)
test (5488, 3)
train (21948, 3)
test (5486, 3)
train (21948, 3)
test (5486, 3)
train (21948, 3)
test (5486, 3)
Train/test datasets prepared.


In [30]:
from utils import make_train_test_prot_orphan

all_train_interactions_arr, all_test_interactions_arr = make_train_test_prot_orphan(df,5)

train (21946, 3)
test (5488, 3)
train (21946, 3)
test (5488, 3)
train (21948, 3)
test (5486, 3)
train (21948, 3)
test (5486, 3)
train (21948, 3)
test (5486, 3)
Train/test datasets prepared.


## test de l'algo de train/test

In [31]:
# algo Matthieu corrected
import numpy as np
intMat = df.pivot(index='indfasta', columns="indsmiles", values='score').to_numpy(dtype=np.float16)
intMat

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=float16)

In [10]:
n_p,n_m = intMat.shape
Ip, Jm = np.where(intMat==1)
print(Ip,Jm,intMat[0,213])
nb_positive_inter = int(len(Ip))
Inp, Jnm = np.where(intMat==0)
Inkp, Jnkm = np.where(np.isnan(intMat))
print(Inkp,Jnkm,intMat[0,0])


[   0    0    1 ... 2512 2512 2512] [ 213  686 4672 ... 1564 1671 3948] 1.0
[   0    0    0 ... 2512 2512 2512] [   0    1    2 ... 4812 4813 4814] nan


In [13]:
#from sklearn import model_selection
from sklearn.model_selection import GroupKFold
groups = np.array(Ip)
group_kfold = GroupKFold(n_splits=5)
#skf_positive = model_selection.KFold(shuffle=True, n_splits=5)
for train_index, test_index in group_kfold.split(range(nb_positive_inter), groups=groups):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [    0     1     2 ... 13709 13710 13712] TEST: [    3     4     5 ... 13714 13715 13716]
TRAIN: [    0     1     2 ... 13714 13715 13716] TEST: [    9    10    11 ... 13708 13709 13710]
TRAIN: [    0     1     2 ... 13714 13715 13716] TEST: [   31    32    33 ... 13699 13705 13712]
TRAIN: [    2     3     4 ... 13714 13715 13716] TEST: [    0     1    26 ... 13680 13682 13701]
TRAIN: [    0     1     3 ... 13714 13715 13716] TEST: [    2    24    25 ... 13702 13703 13704]


In [14]:
Ip[train_index],Jm[train_index],intMat[0,213]

(array([   0,    0,    2, ..., 2512, 2512, 2512]),
 array([ 213,  686,  462, ..., 1564, 1671, 3948]),
 1.0)

In [15]:
Ip[test_index],Jm[test_index],intMat[2,4675]

(array([   1,    4,    4, ..., 2506, 2506, 2506]),
 array([4672, 4369, 4740, ...,  863,  887, 2668]),
 nan)

In [16]:
Mm, bin_edges = np.histogram(Ip[train_index], bins = range(n_p+1)) 
Mp, bin_edges = np.histogram(Jm[train_index], bins = range(n_m+1))
train = np.zeros([1,3], dtype=int)
nb_prot = len(list(set(Ip[train_index])))
nb_prot

2010

In [17]:
for i in range(nb_prot):

        j = np.argmax(Mm) # choose protein with the maximum of interactions in the train

        indice_P = Jm[train_index][np.where(Ip[train_index]==j)[0]]  #np.array with index of interactions + in train
        indice_N = [k for k in Jm[train_index] if intMat[j][k]==0]
        indice_NK = [k for k in Jm[train_index] if np.isnan(intMat[j][k])] #np.array  with index of interactions not known

        indice_freq_mol = np.where(Mp>1)[0]  #drug's index with more than 2 interactions +
        indice_poss_mol = np.where(Mp == 1)[0]  #drug's index with 1 interaction +

        indice_freq_one_prot = np.intersect1d(indice_N, indice_freq_mol)
        indice_poss_one_prot = np.intersect1d(indice_N, indice_poss_mol)

        nb_positive_interactions = len(indice_P)
        nb_frequent_hitters_negative_interactions = len(indice_freq_one_prot)

        indice_freq_one_prot = np.intersect1d(indice_N, indice_freq_mol)
        indice_poss_one_prot = np.intersect1d(indice_N, indice_poss_mol)
        indice_freq_one_prot_NK = np.intersect1d(indice_NK, indice_freq_mol)
        indice_poss_one_prot_NK = np.intersect1d(indice_NK, indice_poss_mol)

        if len(indice_P) <= len(indice_freq_one_prot):
            # we shoot at random nb_positive_interactions in drugs with a lot of interactions
            indice_N_one_prot = np.random.choice(indice_freq_one_prot,
                                                len(indice_P), replace = False)
        elif len(indice_P) <= len(indice_freq_one_prot) + len(indice_poss_one_prot):
            # we shoot at random nb_positive_interactions in drugs with a lot of interactions
            nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot)
            indice_N_one_prot_poss = np.random.choice(indice_poss_one_prot,
                                                    nb_negative_interactions_remaining, replace = False )
            indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                            indice_N_one_prot_poss))
        elif len(indice_P) <= len(indice_freq_one_prot) + len(indice_poss_one_prot) + len(indice_freq_one_prot_NK):
            # we shoot at random nb_positive_interactions in drugs with a lot of interactions
            nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot) - len(indice_poss_one_prot)
            indice_N_one_prot_poss = np.random.choice(indice_freq_one_prot_NK,
                                                    nb_negative_interactions_remaining, replace = False )
            indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                            indice_poss_one_prot, indice_N_one_prot_poss))
        else:
            # we shoot at random nb_positive_interactions in drugs with a lot of interactions
            nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot) - len(indice_poss_one_prot) - len(indice_freq_one_prot_NK)
            #print("nb_negative_interactions_remaining", nb_negative_interactions_remaining) # pas de solution...
            #print(indice_poss_one_prot_NK.shape)
            indice_N_one_prot_poss = np.random.choice(indice_poss_one_prot_NK,
                                                    nb_negative_interactions_remaining, replace = False )
            indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                            indice_poss_one_prot, indice_freq_one_prot_NK, indice_N_one_prot_poss))

        Mp[indice_N_one_prot.astype(int)]-=1

        # this protein has been processed
        Mm[j] = 0

        indice = np.r_[indice_P,indice_N_one_prot].astype(int)
        etiquette = [x if not np.isnan(x) else 0 for x in intMat[j][indice]]
        A = np.stack((indice, etiquette), axis=-1)
        B = np.c_[np.zeros(A.shape[0])+j,A].astype(int)
        train = np.concatenate((train,B))

        train = train[1:]

In [18]:
len(train)

19939

In [19]:
# test
#test_index =  np.random.choice(test_index, int(p*len(test_index)), replace=False)
# interactions + in test
indice_P_t = np.c_[Ip[test_index],Jm[test_index], np.ones(len(test_index))].astype(int)

# interactions - in test
a = np.r_[np.c_[Inp,Jnm]] # all the zeros in the matrix (and NK ?)
a1 = set(map(tuple, a))
b = train[:,:2]   # all the interactions in the train
b1 = set(map(tuple, b))
indice_N_t = np.array(list(a1 - b1))#[:indice_P_t.shape[0],:] # we keep the same number of interactions - than interactions + in test, choosing the 0 in the matrix
#print(len(indice_N_t))

# add interactions np.nan in test

if len(indice_N_t) == 0:
    # initialization
    indice_N_t = np.array([-1, -1]).reshape(1,2)

c = np.r_[np.c_[Inkp,Jnkm]] # all the np.nan in the matrix

if len(indice_N_t) < indice_P_t.shape[0]:
    # we add some interactions - in test to have the same number of interactions + and - in test choose in the np.nan in the matrix
    k = 0
    while len(indice_N_t) < indice_P_t.shape[0]+1:
        i = np.random.randint(0, len(c))
        if tuple(c[i]) not in b1:
            indice_N_t = np.concatenate((indice_N_t, c[i].reshape(1,2)))
            k += 1

# we drop the first row of indice_N_t if is [-1, -1]
if indice_N_t[0,0] == -1:
    indice_N_t = indice_N_t[1:,:]

indice_N_t = indice_N_t[:len(indice_P_t),:]

# we add the column of 0 for the etiquette
indice_N_t = np.c_[indice_N_t, np.zeros(len(indice_N_t))].astype(int)
test = np.r_[indice_P_t,indice_N_t]

      

In [20]:
np.where([0,213,1] in train),np.where([0,213,1] in test)
[0,213,1] in train,[0,213,1] in test

(True, True)

In [21]:
np.where([2,4675,1] in train),np.where([2,4675,1] in test)
#[2,4675,1] in train,[2,4675,1] in test

((array([0]),), (array([0]),))

In [22]:
Ip[test_index], Jm[test_index]

(array([   1,    4,    4, ..., 2506, 2506, 2506]),
 array([4672, 4369, 4740, ...,  863,  887, 2668]))

In [23]:
for elt in train:
    for x in test:
        if elt[0]==x[0] and elt[1]==x[1]:
            print(elt)


In [148]:
train[:,:2]

array([[ 740,  101],
       [ 740,  103],
       [ 740,  111],
       ...,
       [2510,  419],
       [2511, 4596],
       [2511,  750]])

In [156]:
for elt in test:
    if tuple(elt[:2]) in set(map(tuple, train[:,:2])):
        print(elt)


In [157]:
S_train = set(map(tuple, train))
S_test = set(map(tuple, test))
S_train.intersection(S_test)

set()

In [104]:
intMat = df.pivot(index='indfasta', columns="indsmiles", values='score').to_numpy(dtype=np.float16)
n_p,n_m = intMat.shape
Ip, Jm = np.where(intMat==1)
nb_positive_inter = int(len(Ip))
Inp, Jnm = np.where(intMat==0)
Inkp, Jnkm = np.where(np.isnan(intMat))

Mm, bin_edges = np.histogram(Ip[train_index], bins = range(n_p+1)) # np.array with  #interactions for each protein of the train at the beginning
Mp, bin_edges = np.histogram(Jm[train_index], bins = range(n_m+1)) # np.array with  #interactions for each drugs at the beginning (how manu time it can be chosen)
train = np.zeros([1,3], dtype=int)
nb_prot = len(list(set(Ip[train_index]))) # number of different prot in train
j = np.argmax(Mm)
print(j)
indice_P = Jm[train_index][np.where(Ip[train_index]==j)[0]]
indice_P

969


array([1158, 1208, 1227, 1251, 1268, 1297, 1338, 1395, 1455, 1486, 1507,
       1546, 1585, 1623, 1638, 1642, 1646, 1671, 1797, 1823, 1854, 1885,
       1888, 1920, 1994, 2113, 2151, 2185, 2232, 2384, 2411, 2414, 2756,
       2863, 2939, 3013, 3016, 3033, 3075, 3097, 3105, 3156, 3191, 3202,
       3234, 3392, 3420, 3433, 3439, 3442, 3458, 3460, 3461, 3462, 3465,
       3466, 3467, 3499, 3507, 3511, 3517, 3520, 3567, 3569, 3570, 3598,
       3613, 3614, 3616, 3620, 3640, 3641, 3709, 3738, 3770, 3838, 3858,
       3871, 3872, 3874, 3878, 3880, 3881, 3882, 3883, 3884, 3885, 3887,
       3888, 3918, 3949, 3957, 3961, 3962, 3979, 3995, 3996, 4081, 4093,
       4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4158, 4210, 4214,
       4224, 4257])

In [105]:
indice_N = [k for k in Jm[train_index] if intMat[j][k]==0]
indice_NK = [k for k in Jm[train_index] if np.isnan(intMat[j][k])] #np.array  with index of interactions not known

indice_freq_mol = np.where(Mp>1)[0]  #drug's index with more than 2 interactions +
indice_poss_mol = np.where(Mp == 1)[0]  #drug's index with 1 interaction +

indice_freq_one_prot = np.intersect1d(indice_N, indice_freq_mol)
indice_poss_one_prot = np.intersect1d(indice_N, indice_poss_mol)

nb_positive_interactions = len(indice_P)
nb_frequent_hitters_negative_interactions = len(indice_freq_one_prot)

indice_freq_one_prot = np.intersect1d(indice_N, indice_freq_mol)
indice_poss_one_prot = np.intersect1d(indice_N, indice_poss_mol)
indice_freq_one_prot_NK = np.intersect1d(indice_NK, indice_freq_mol)
indice_poss_one_prot_NK = np.intersect1d(indice_NK, indice_poss_mol)

In [106]:
print(len(indice_P),len(indice_freq_one_prot))
print(len(indice_P), len(indice_freq_one_prot) + len(indice_poss_one_prot))
print(len(indice_P) , len(indice_freq_one_prot) + len(indice_poss_one_prot) + len(indice_freq_one_prot_NK))

112 0
112 0
112 1479


In [108]:
if len(indice_P) <= len(indice_freq_one_prot):
    # we shoot at random nb_positive_interactions in drugs with a lot of interactions
    indice_N_one_prot = np.random.choice(indice_freq_one_prot,
                                        len(indice_P), replace = False)
elif len(indice_P) <= len(indice_freq_one_prot) + len(indice_poss_one_prot):
    # we shoot at random nb_positive_interactions in drugs with a lot of interactions
    nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot)
    indice_N_one_prot_poss = np.random.choice(indice_poss_one_prot,
                                            nb_negative_interactions_remaining, replace = False )
    indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                    indice_N_one_prot_poss))
elif len(indice_P) <= len(indice_freq_one_prot) + len(indice_poss_one_prot) + len(indice_freq_one_prot_NK):
    # we shoot at random nb_positive_interactions in drugs with a lot of interactions
    nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot) - len(indice_poss_one_prot)
    indice_N_one_prot_poss = np.random.choice(indice_freq_one_prot_NK,
                                            nb_negative_interactions_remaining, replace = False ).astype(int)
    indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                    indice_poss_one_prot, indice_N_one_prot_poss))
    print(indice_N_one_prot)
else:
    # we shoot at random nb_positive_interactions in drugs with a lot of interactions
    nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot) - len(indice_poss_one_prot) - len(indice_freq_one_prot_NK)
    #print("nb_negative_interactions_remaining", nb_negative_interactions_remaining) # pas de solution...
    #print(indice_poss_one_prot_NK.shape)
    indice_N_one_prot_poss = np.random.choice(indice_poss_one_prot_NK,
                                            nb_negative_interactions_remaining, replace = False )
    indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                    indice_poss_one_prot, indice_freq_one_prot_NK, indice_N_one_prot_poss))

Mp[indice_N_one_prot.astype(int)]-=1

[4446.  684.  483. 2714. 1409. 4739. 4445. 4380. 2560. 2825.  467.  117.
 1016.   55. 2334.  320. 3252.   10.  854.   72. 1379.  522. 4499. 2688.
  606.  283.  962. 2617. 4014.  997. 1031. 1115. 1024.   93. 1009.  702.
  686.  965.  499.  655. 4430. 4255. 4720. 3357.  828. 4716.  450.  250.
  992.  339.  644. 2587. 1232.  806. 4768. 1147.   20. 3389.  938.  948.
 4513.  374.   15.  514.  364. 2489.  236.  867. 4770. 2534. 1303.  318.
  723.  206. 4477. 2689.  105. 4304. 3581. 4191.  305.  837. 2899.  134.
 4741.  749. 2429. 1506. 1531.  672.  208.  630.  988.  772. 4637. 4614.
   61.  978.   42.  991. 2809.  176.  447. 1871.  407.  559. 1120.  360.
  133. 1037.  469. 4727.]


In [109]:
Mm[j] = 0

indice = np.r_[indice_P,indice_N_one_prot].astype(int)
etiquette = [x if not np.isnan(x) else 0 for x in intMat[j][indice]]
A = np.stack((indice, etiquette), axis=-1)
B = np.c_[np.zeros(A.shape[0])+j,A].astype(int)
train = np.concatenate((train,B))

In [110]:
train

array([[   0,    0,    0],
       [ 969, 1158,    1],
       [ 969, 1208,    1],
       [ 969, 1227,    1],
       [ 969, 1251,    1],
       [ 969, 1268,    1],
       [ 969, 1297,    1],
       [ 969, 1338,    1],
       [ 969, 1395,    1],
       [ 969, 1455,    1],
       [ 969, 1486,    1],
       [ 969, 1507,    1],
       [ 969, 1546,    1],
       [ 969, 1585,    1],
       [ 969, 1623,    1],
       [ 969, 1638,    1],
       [ 969, 1642,    1],
       [ 969, 1646,    1],
       [ 969, 1671,    1],
       [ 969, 1797,    1],
       [ 969, 1823,    1],
       [ 969, 1854,    1],
       [ 969, 1885,    1],
       [ 969, 1888,    1],
       [ 969, 1920,    1],
       [ 969, 1994,    1],
       [ 969, 2113,    1],
       [ 969, 2151,    1],
       [ 969, 2185,    1],
       [ 969, 2232,    1],
       [ 969, 2384,    1],
       [ 969, 2411,    1],
       [ 969, 2414,    1],
       [ 969, 2756,    1],
       [ 969, 2863,    1],
       [ 969, 2939,    1],
       [ 969, 3013,    1],
 

In [76]:
def make_train_test(df,nb_folds):
  """
    make train and test sets
  """

  # algo Matthieu corrected
  intMat = df.pivot(index='indfasta', columns="indsmiles", values='score').to_numpy(dtype=np.float16)

  # Set the different folds
  skf_positive = model_selection.KFold(shuffle=True, n_splits=nb_folds)

  all_train_interactions_arr = []
  all_test_interactions_arr = []

  n_p,n_m = intMat.shape
  Ip, Jm = np.where(intMat==1)
  nb_positive_inter = int(len(Ip))
  Inp, Jnm = np.where(intMat==0)
  Inkp, Jnkm = np.where(np.isnan(intMat))

  for train_index, test_index in skf_positive.split(range(nb_positive_inter)):
      # 9' pour train
      #train_index = np.random.choice(train_index, int(p*len(train_index)), replace=False)

      Mm, bin_edges = np.histogram(Ip[train_index], bins = range(n_p+1)) # np.array with  #interactions for each protein of the train at the beginning

      Mp, bin_edges = np.histogram(Jm[train_index], bins = range(n_m+1)) # np.array with  #interactions for each drugs at the beginning (how manu time it can be chosen)

      train = np.zeros([1,3], dtype=int)

      nb_prot = len(list(set(Ip[train_index]))) # number of different prot in train
      for i in range(nb_prot):

          j = np.argmax(Mm) # choose protein with the maximum of interactions in the train

          indice_P = Jm[train_index][np.where(Ip[train_index]==j)[0]]  #np.array with index of interactions + in train
          indice_N = [k for k in Jm[train_index] if intMat[j][k]==0]
          indice_NK = [k for k in Jm[train_index] if np.isnan(intMat[j][k])] #np.array  with index of interactions not known

          indice_freq_mol = np.where(Mp>1)[0]  #drug's index with more than 2 interactions +
          indice_poss_mol = np.where(Mp == 1)[0]  #drug's index with 1 interaction +

          indice_freq_one_prot = np.intersect1d(indice_N, indice_freq_mol)
          indice_poss_one_prot = np.intersect1d(indice_N, indice_poss_mol)

          nb_positive_interactions = len(indice_P)
          nb_frequent_hitters_negative_interactions = len(indice_freq_one_prot)

          indice_freq_one_prot = np.intersect1d(indice_N, indice_freq_mol)
          indice_poss_one_prot = np.intersect1d(indice_N, indice_poss_mol)
          indice_freq_one_prot_NK = np.intersect1d(indice_NK, indice_freq_mol)
          indice_poss_one_prot_NK = np.intersect1d(indice_NK, indice_poss_mol)

          if len(indice_P) <= len(indice_freq_one_prot):
              # we shoot at random nb_positive_interactions in drugs with a lot of interactions
              indice_N_one_prot = np.random.choice(indice_freq_one_prot,
                                                  len(indice_P), replace = False)
          elif len(indice_P) <= len(indice_freq_one_prot) + len(indice_poss_one_prot):
              # we shoot at random nb_positive_interactions in drugs with a lot of interactions
              nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot)
              indice_N_one_prot_poss = np.random.choice(indice_poss_one_prot,
                                                      nb_negative_interactions_remaining, replace = False )
              indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                              indice_N_one_prot_poss))
          elif len(indice_P) <= len(indice_freq_one_prot) + len(indice_poss_one_prot) + len(indice_freq_one_prot_NK):
              # we shoot at random nb_positive_interactions in drugs with a lot of interactions
              nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot) - len(indice_poss_one_prot)
              indice_N_one_prot_poss = np.random.choice(indice_freq_one_prot_NK,
                                                      nb_negative_interactions_remaining, replace = False )
              indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                              indice_poss_one_prot, indice_N_one_prot_poss))
          else:
              # we shoot at random nb_positive_interactions in drugs with a lot of interactions
              nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot) - len(indice_poss_one_prot) - len(indice_freq_one_prot_NK)
              #print("nb_negative_interactions_remaining", nb_negative_interactions_remaining) # pas de solution...
              #print(indice_poss_one_prot_NK.shape)
              indice_N_one_prot_poss = np.random.choice(indice_poss_one_prot_NK,
                                                      nb_negative_interactions_remaining, replace = False )
              indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                              indice_poss_one_prot, indice_freq_one_prot_NK, indice_N_one_prot_poss))

          Mp[indice_N_one_prot.astype(int)]-=1

          # this protein has been processed
          Mm[j] = 0

          indice = np.r_[indice_P,indice_N_one_prot].astype(int)
          etiquette = [x if not np.isnan(x) else 0 for x in intMat[j][indice]]
          A = np.stack((indice, etiquette), axis=-1)
          B = np.c_[np.zeros(A.shape[0])+j,A].astype(int)
          train = np.concatenate((train,B))

      train = train[1:]
      all_train_interactions_arr.append(train)
      print("train", train.shape)


      # test
      #test_index =  np.random.choice(test_index, int(p*len(test_index)), replace=False)
      # interactions + in test
      indice_P_t = np.c_[Ip[test_index],Jm[test_index], np.ones(len(test_index))].astype(int)

      # interactions - in test
      a = np.r_[np.c_[Inp,Jnm]] # all the zeros in the matrix (and NK ?)
      a1 = set(map(tuple, a))
      b = train[:,:2]   # all the interactions in the train
      b1 = set(map(tuple, b))
      indice_N_t = np.array(list(a1 - b1))#[:indice_P_t.shape[0],:] # we keep the same number of interactions - than interactions + in test, choosing the 0 in the matrix
      #print(len(indice_N_t))

      # add interactions np.nan in test

      if len(indice_N_t) == 0:
          # initialization
          indice_N_t = np.array([-1, -1]).reshape(1,2)

      c = np.r_[np.c_[Inkp,Jnkm]] # all the np.nan in the matrix

      if len(indice_N_t) < indice_P_t.shape[0]:
          # we add some interactions - in test to have the same number of interactions + and - in test choose in the np.nan in the matrix
          k = 0
          while len(indice_N_t) < indice_P_t.shape[0]+1:
              i = np.random.randint(0, len(c))
              if tuple(c[i]) not in b1:
                  indice_N_t = np.concatenate((indice_N_t, c[i].reshape(1,2)))
                  k += 1

      # we drop the first row of indice_N_t if is [-1, -1]
      if indice_N_t[0,0] == -1:
          indice_N_t = indice_N_t[1:,:]

      indice_N_t = indice_N_t[:len(indice_P_t),:]

      # we add the column of 0 for the etiquette
      indice_N_t = np.c_[indice_N_t, np.zeros(len(indice_N_t))].astype(int)
      test = np.r_[indice_P_t,indice_N_t]

      all_test_interactions_arr.append(test)
      print("test", test.shape)

  print("Train/test datasets prepared.")
  return all_train_interactions_arr, all_test_interactions_arr

## algo pour faire test/train avec des molécules orphelines et des protéines orphelines
## attention, indfasta différent et même fasta

In [36]:
print(df[["indfasta"]].drop_duplicates().reset_index().shape)
print(df[["fasta"]].drop_duplicates().reset_index().shape)

(2513, 2)
(2507, 2)


In [38]:
# on garde 20% des fastas pour le test
nP = df[["indfasta"]].drop_duplicates().reset_index().shape[0]
S = np.random.permutation(nP)

# on garde les pairs avec ces protéines
df_train = df[df["indfasta"].isin(S[:int(0.8*nP)])]
df_test = df[df["indfasta"].isin(S[int(0.8*nP):])]
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
print(df_train.shape)
print(df_test.shape)



(11024, 7)
(2693, 7)
