# Classification using BFGS -- Pytorch version

This notebook details the implementation of a generic ridge-regularized classification solved by direct gradient-based optimization (here quasi-newton). 
It is implemented in the kernel space, i.e. representing the weights over the space of points.

In [1]:
%load_ext autoreload
%autoreload 2
import torch
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device_cpu = torch.device("cpu")
device_cpu = device
print( device )

mytype = torch.float16 # to save memory (only on GPU)
mytype = torch.float32

cpu


# Data

In [2]:
%load_ext autoreload
%autoreload 2

import utils
from utils import load_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
df_DB = load_data()
print(df_DB.shape)
df_DB.head()

(13717, 7)


Unnamed: 0,uniprot,DBid,smiles,ind2mol,fasta,ind2prot,inter
0,A0A024R8I1,DB00381,CCOC(=O)C1=C(COCCN)NC(C)=C(C1C1=CC=CC=C1Cl)C(=...,213,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1
1,A0A024R8I1,DB00996,NCC1(CC(O)=O)CCCCC1,686,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1
2,A1L3X4,DB12965,[Ag],4672,MDLSCSCATGGSCTCASSCKCKEYKCTSCKKNCCSCCPMGCAKCAQGCT,1,1
3,A5X5Y0,DB00715,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,462,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1
4,A5X5Y0,DB09304,CN1CCC2=C(C1)C1=CC=CC=C1CC1=CC=CC=C21,4467,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1


In [4]:
df_DB[df_DB["smiles"] =="NC1=C(C2=C(N)N=C(N)N=C2C=C1)[Cl](=O)=O"]
# drop the 2 molecules with the same smiles
df_DB = df_DB.drop(12222)
df_DB[df_DB["smiles"] =="NC1=C(C2=C(N)N=C(N)N=C2C=C1)[Cl](=O)=O"]


Unnamed: 0,uniprot,DBid,smiles,ind2mol,fasta,ind2prot,inter


In [5]:
df_DB.shape

(13716, 7)

In [25]:
df_DB.to_csv('data/drugbank.csv', index=False)

## Kprot

In [26]:
import pickle
with open('data/drugbank_K_prot.data', 'rb') as f:
        K_prot = pickle.load(f)

In [27]:
K_prot.shape

(2513, 2513)

## liste des 4814 smiles

In [28]:
# same in zip format
import pandas as pd
import zipfile
zf = zipfile.ZipFile('data/drugbank.csv.zip') 
df = pd.read_csv(zf.open('drugbank.csv'),low_memory=False)
df_p = df[df['inter'] == True]
#list of smiles strings
smiles = df_p['smiles'].drop_duplicates().values
len(smiles)

4813

In [29]:
from rdkit import Chem
from rdkit.Chem import AllChem

import numpy as np

nM =  len(smiles)
MorganFP = np.zeros((nM,1024))
for i in range(nM):
    # Convert SMILES to RDKit molecule object
    mol = Chem.MolFromSmiles(smiles[i])    
    # Generate Morgan fingerprint of the molecule
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    # Convert the fingerprint to a numpy array
    arr = np.zeros((1,))
    AllChem.DataStructs.ConvertToNumpyArray(fp, arr)
    MorganFP[i,:] = arr
MorganFP = MorganFP.astype(int)

[22:22:02] Unusual charge on atom 0 number of radical electrons set to zero


In [30]:
import Nystrom_method
from  Nystrom_method import nystroem,KronKernel
# random list of molecules 
kM = 4814 # number of molecule to compute nystrom
rM = 1000 # final dimension of features
I = np.random.permutation(nM)
I = I[:kM]

In [31]:
# compute Tanimoto kernel 
Km = ( MorganFP[I,:] @ MorganFP.T ) / ( 1024 - (1-MorganFP[I,:]) @ (1-MorganFP.T) )

In [32]:
Xm,Lambda,LambdaC = nystroem(np.concatenate((Km[:,I], Km), axis=1),rM)

## liste des indices protéines/molécules avec que des 1

In [34]:
# protein indices
J = df_p['ind2prot'].values
print(len(J))
# molecules indices
I = df_p['ind2mol'].values
print(len(I))

13716
13716


## train/test avec indices protéines/molécules et interactions balanced

en premier l'indice de la protéine, puis l'indice du ligand puis l'interaction

In [29]:
# change name of the column 'ind2prot' in 'indfasta' in df
df = df_DB.rename(columns={'ind2prot': 'indfasta', 'ind2mol': 'indsmiles', 'inter': 'score'})
df.head()

Unnamed: 0,uniprot,DBid,smiles,indsmiles,fasta,indfasta,score
0,A0A024R8I1,DB00381,CCOC(=O)C1=C(COCCN)NC(C)=C(C1C1=CC=CC=C1Cl)C(=...,213,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1
1,A0A024R8I1,DB00996,NCC1(CC(O)=O)CCCCC1,686,MVRFGDELGGRYGGPGGGERARGGGAGGAGGPGPGGLQPGQRVLYK...,0,1
2,A1L3X4,DB12965,[Ag],4672,MDLSCSCATGGSCTCASSCKCKEYKCTSCKKNCCSCCPMGCAKCAQGCT,1,1
3,A5X5Y0,DB00715,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,462,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1
4,A5X5Y0,DB09304,CN1CCC2=C(C1)C1=CC=CC=C1CC1=CC=CC=C21,4467,MEGSWFHRKRFSFYLLLGFLLQGRGVTFTINCSGFGQHGADPTALN...,2,1


In [30]:
from utils import make_train_test

all_train_interactions_arr, all_test_interactions_arr = make_train_test(df_name,5,1)

train (21944, 3)
test (5488, 3)
train (21946, 3)
test (5486, 3)
train (21946, 3)
test (5486, 3)
train (21946, 3)
test (5486, 3)
train (21946, 3)
test (5486, 3)
Train/test datasets prepared.


In [33]:
test = all_test_interactions_arr[2]
train = all_train_interactions_arr[2]
c = 0
for elt in test[:,:]:
    if elt in train[:,:]:
        c+=1
        print(elt)
print(c)
print(len(test[:,0:2]))

[ 580 2777    1]
[925 585   1]
[ 571 1018    1]
[933 640   1]
[1555 4762    1]
[1815 4596    1]
[ 863 2916    1]
[ 820 4505    1]
[321 962   1]
[ 711 4671    1]
[615  77   1]
[2228 4764    1]
[ 969 4133    1]
[1316  101    1]
[530  11   1]
[ 298 1929    1]
[1200 4252    1]
[957 462   1]
[1771  616    1]
[ 554 2794    1]
[1862 1361    1]
[1531 4079    1]
[1296 4706    1]
[ 748 2443    1]
[1020 4304    1]
[1607  572    1]
[902 373   1]
[1647  432    1]
[1153 4457    1]
[1009 4761    1]
[1928   42    1]
[2130 4655    1]
[2430 4576    1]
[849 630   1]
[1026 3638    1]
[2197 3486    1]
[ 820 2660    1]
[1023  314    1]
[ 31 563   1]
[728 397   1]
[  30 4437    1]
[ 515 4682    1]
[ 876 2478    1]
[ 580 4573    1]
[997 722   1]
[ 533 1411    1]
[ 907 1097    1]
[ 511 1171    1]
[ 997 2494    1]
[ 373 2806    1]
[399 840   1]
[ 291 2133    1]
[1828  924    1]
[1161 4473    1]
[2067 1100    1]
[1771 4693    1]
[ 261 1691    1]
[1023  203    1]
[989 902   1]
[820 236   1]
[1024  234    1]
[714 

In [111]:
# algo Matthieu corrected
intMat = df.pivot(index='indfasta', columns="indsmiles", values='score').to_numpy(dtype=np.float16)
intMat

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=float16)

In [112]:
n_p,n_m = intMat.shape
Ip, Jm = np.where(intMat==1)
print(Ip,Jm,intMat[0,213])
nb_positive_inter = int(len(Ip))
Inp, Jnm = np.where(intMat==0)
Inkp, Jnkm = np.where(np.isnan(intMat))
print(Inkp,Jnkm,intMat[0,0])


[   0    0    1 ... 2512 2512 2512] [ 213  686 4671 ... 1563 1670 3947] 1.0
[   0    0    0 ... 2512 2512 2512] [   0    1    2 ... 4811 4812 4813] nan


In [117]:
from sklearn import model_selection
skf_positive = model_selection.KFold(shuffle=True, n_splits=5)
for train_index, test_index in skf_positive.split(range(nb_positive_inter)):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [    0     1     2 ... 13711 13713 13715] TEST: [    3     4    11 ... 13698 13712 13714]
TRAIN: [    0     1     3 ... 13713 13714 13715] TEST: [    2     8    16 ... 13702 13703 13708]
TRAIN: [    0     2     3 ... 13712 13713 13714] TEST: [    1     5    13 ... 13710 13711 13715]
TRAIN: [    1     2     3 ... 13713 13714 13715] TEST: [    0     6     9 ... 13704 13705 13709]
TRAIN: [    0     1     2 ... 13712 13714 13715] TEST: [    7    22    28 ... 13693 13706 13713]


In [133]:
Ip[train_index],Jm[train_index],intMat[0,213]

(array([   0,    0,    1, ..., 2512, 2512, 2512]),
 array([ 213,  686, 4671, ...,  149, 1670, 3947]),
 1.0)

In [134]:
Ip[test_index],Jm[test_index],intMat[2,4675]

(array([   2,    3,    7, ..., 2502, 2508, 2512]),
 array([4675, 1019, 1775, ..., 4596, 3430, 1563]),
 1.0)

In [119]:
Mm, bin_edges = np.histogram(Ip[train_index], bins = range(n_p+1)) 
Mp, bin_edges = np.histogram(Jm[train_index], bins = range(n_m+1))
train = np.zeros([1,3], dtype=int)
nb_prot = len(list(set(Ip[train_index])))
nb_prot

2288

In [120]:
for i in range(nb_prot):

        j = np.argmax(Mm) # choose protein with the maximum of interactions in the train

        indice_P = Jm[train_index][np.where(Ip[train_index]==j)[0]]  #np.array with index of interactions + in train
        indice_N = [k for k in Jm[train_index] if intMat[j][k]==0]
        indice_NK = [k for k in Jm[train_index] if np.isnan(intMat[j][k])] #np.array  with index of interactions not known

        indice_freq_mol = np.where(Mp>1)[0]  #drug's index with more than 2 interactions +
        indice_poss_mol = np.where(Mp == 1)[0]  #drug's index with 1 interaction +

        indice_freq_one_prot = np.intersect1d(indice_N, indice_freq_mol)
        indice_poss_one_prot = np.intersect1d(indice_N, indice_poss_mol)

        nb_positive_interactions = len(indice_P)
        nb_frequent_hitters_negative_interactions = len(indice_freq_one_prot)

        indice_freq_one_prot = np.intersect1d(indice_N, indice_freq_mol)
        indice_poss_one_prot = np.intersect1d(indice_N, indice_poss_mol)
        indice_freq_one_prot_NK = np.intersect1d(indice_NK, indice_freq_mol)
        indice_poss_one_prot_NK = np.intersect1d(indice_NK, indice_poss_mol)

        if len(indice_P) <= len(indice_freq_one_prot):
            # we shoot at random nb_positive_interactions in drugs with a lot of interactions
            indice_N_one_prot = np.random.choice(indice_freq_one_prot,
                                                len(indice_P), replace = False)
        elif len(indice_P) <= len(indice_freq_one_prot) + len(indice_poss_one_prot):
            # we shoot at random nb_positive_interactions in drugs with a lot of interactions
            nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot)
            indice_N_one_prot_poss = np.random.choice(indice_poss_one_prot,
                                                    nb_negative_interactions_remaining, replace = False )
            indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                            indice_N_one_prot_poss))
        elif len(indice_P) <= len(indice_freq_one_prot) + len(indice_poss_one_prot) + len(indice_freq_one_prot_NK):
            # we shoot at random nb_positive_interactions in drugs with a lot of interactions
            nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot) - len(indice_poss_one_prot)
            indice_N_one_prot_poss = np.random.choice(indice_freq_one_prot_NK,
                                                    nb_negative_interactions_remaining, replace = False )
            indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                            indice_poss_one_prot, indice_N_one_prot_poss))
        else:
            # we shoot at random nb_positive_interactions in drugs with a lot of interactions
            nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot) - len(indice_poss_one_prot) - len(indice_freq_one_prot_NK)
            #print("nb_negative_interactions_remaining", nb_negative_interactions_remaining) # pas de solution...
            #print(indice_poss_one_prot_NK.shape)
            indice_N_one_prot_poss = np.random.choice(indice_poss_one_prot_NK,
                                                    nb_negative_interactions_remaining, replace = False )
            indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                            indice_poss_one_prot, indice_freq_one_prot_NK, indice_N_one_prot_poss))

        Mp[indice_N_one_prot.astype(int)]-=1

        # this protein has been processed
        Mm[j] = 0

        indice = np.r_[indice_P,indice_N_one_prot].astype(int)
        etiquette = [x if not np.isnan(x) else 0 for x in intMat[j][indice]]
        A = np.stack((indice, etiquette), axis=-1)
        B = np.c_[np.zeros(A.shape[0])+j,A].astype(int)
        train = np.concatenate((train,B))

        train = train[1:]

In [121]:
len(train)

19659

In [122]:
# test
#test_index =  np.random.choice(test_index, int(p*len(test_index)), replace=False)
# interactions + in test
indice_P_t = np.c_[Ip[test_index],Jm[test_index], np.ones(len(test_index))].astype(int)

# interactions - in test
a = np.r_[np.c_[Inp,Jnm]] # all the zeros in the matrix (and NK ?)
a1 = set(map(tuple, a))
b = train[:,:2]   # all the interactions in the train
b1 = set(map(tuple, b))
indice_N_t = np.array(list(a1 - b1))#[:indice_P_t.shape[0],:] # we keep the same number of interactions - than interactions + in test, choosing the 0 in the matrix
#print(len(indice_N_t))

# add interactions np.nan in test

if len(indice_N_t) == 0:
    # initialization
    indice_N_t = np.array([-1, -1]).reshape(1,2)

c = np.r_[np.c_[Inkp,Jnkm]] # all the np.nan in the matrix

if len(indice_N_t) < indice_P_t.shape[0]:
    # we add some interactions - in test to have the same number of interactions + and - in test choose in the np.nan in the matrix
    k = 0
    while len(indice_N_t) < indice_P_t.shape[0]+1:
        i = np.random.randint(0, len(c))
        if tuple(c[i]) not in b1:
            indice_N_t = np.concatenate((indice_N_t, c[i].reshape(1,2)))
            k += 1

# we drop the first row of indice_N_t if is [-1, -1]
if indice_N_t[0,0] == -1:
    indice_N_t = indice_N_t[1:,:]

indice_N_t = indice_N_t[:len(indice_P_t),:]

# we add the column of 0 for the etiquette
indice_N_t = np.c_[indice_N_t, np.zeros(len(indice_N_t))].astype(int)
test = np.r_[indice_P_t,indice_N_t]

      

In [137]:
np.where([0,213,1] in train),np.where([0,213,1] in test)
[0,213,1] in train,[0,213,1] in test

(True, True)

In [141]:
np.where([2,4675,1] in train),np.where([2,4675,1] in test)
#[2,4675,1] in train,[2,4675,1] in test

((array([0]),), (array([0]),))

In [139]:
Ip[test_index], Jm[test_index]

(array([   2,    3,    7, ..., 2502, 2508, 2512]),
 array([4675, 1019, 1775, ..., 4596, 3430, 1563]))

In [140]:
for i,j in zip(Ip[train_index], Jm[train_index]):
    print(i,j)

0 213
0 686
1 4671
2 462
2 4466
2 4505
2 4607
2 4694
3 103
3 344
3 443
3 531
3 553
3 563
3 606
3 866
3 1002
3 1003
3 1007
3 1016
3 1017
3 1020
4 4368
4 4739
5 4745
6 297
7 3923
8 1412
8 1481
8 2325
9 157
9 616
9 927
10 2318
12 1076
12 1123
13 628
13 838
13 1563
13 2655
14 38
15 24
18 321
18 339
18 774
19 4540
20 2198
21 34
21 113
21 119
21 131
21 224
21 247
21 946
21 2515
21 4437
21 4440
21 4442
21 4494
22 2927
22 4367
22 4585
22 4596
22 4638
23 24
24 77
24 3748
24 4290
24 4456
25 4596
26 739
28 38
29 380
29 2974
29 4045
29 4739
30 34
30 119
30 247
30 423
30 552
30 558
30 889
30 2824
30 4438
30 4442
30 4494
31 65
31 68
31 101
31 103
31 111
31 119
31 146
31 161
31 189
31 233
31 234
31 280
31 351
31 382
31 400
31 422
31 443
31 493
31 524
31 531
31 545
31 553
31 563
31 606
31 713
31 742
31 777
31 816
31 843
31 857
31 882
31 978
31 1002
31 1003
31 1016
31 1019
31 1020
31 1076
31 1183
31 4393
31 4529
31 4577
32 4596
33 5
33 1599
33 2015
34 28
34 44
34 2069
34 2469
35 21
35 496
35 2168
35 42

In [142]:
for i,j in zip(Ip[test_index], Jm[test_index]):
    print(i,j)

2 4675
3 1019
7 1775
7 3669
11 2343
12 1570
12 1798
13 1670
13 3947
16 4596
17 1564
17 1725
21 552
21 4387
21 4438
22 77
22 2673
24 1347
24 2777
27 9
29 1419
29 4059
29 4596
30 4387
30 4437
30 4440
31 108
31 155
31 204
31 292
31 341
31 344
31 726
31 866
31 1007
31 1017
31 4576
33 52
33 3695
34 2466
36 2322
43 1097
46 4769
50 3241
50 3310
50 3545
50 4267
50 4269
51 68
51 101
51 233
51 344
51 382
51 400
51 493
51 742
51 777
51 1003
51 1183
51 4529
51 4576
58 40
61 645
61 2846
62 4596
64 4220
65 2483
65 3621
67 2223
68 2607
68 3837
71 3505
74 105
74 2742
75 3951
83 4596
84 0
85 0
85 16
86 4780
88 1458
88 2861
89 0
89 24
90 82
90 2159
91 72
91 177
91 589
91 4683
94 21
96 3952
97 977
98 297
98 1147
98 1175
98 3409
98 4596
99 4387
104 76
105 7
105 8
109 3252
114 141
115 247
115 946
115 2502
116 4608
117 640
117 2565
118 2565
119 511
119 617
120 1706
123 2023
124 38
128 50
129 1123
130 25
130 4288
132 14
132 4596
136 38
138 1383
138 1871
143 72
143 145
143 161
143 564
143 4478
144 40
147 253


In [150]:
for elt in train:
    for x in test:
        if elt[0]==x[0] and elt[1]==x[1]:
            print(elt)


In [148]:
train[:,:2]

array([[ 740,  101],
       [ 740,  103],
       [ 740,  111],
       ...,
       [2510,  419],
       [2511, 4596],
       [2511,  750]])

In [156]:
for elt in test:
    if tuple(elt[:2]) in set(map(tuple, train[:,:2])):
        print(elt)


In [157]:
S_train = set(map(tuple, train))
S_test = set(map(tuple, test))
S_train.intersection(S_test)

set()

In [101]:
c = 0
for elt in test[:,:]:
    if elt in train[:,:]:
        c+=1
        print(elt)
print(c)
print(len(test[:,0:2]))

[84  0  1]
[85  0  1]
[216   0   1]
[403   0   1]
[480   0   1]
[657   0   1]
[918   0   1]
[940   0   1]
[1134    0    1]
[1890    0    1]
[1907    0    1]
[1952    0    1]
[2279    0    1]
[2375    0    1]
[2489    0    1]
[2232 2638    0]
[1793 3132    0]
[1001 2365    0]
[ 733 4291    0]
[ 114 4154    0]
[ 261 2398    0]
[ 614 3342    0]
[2442 4009    0]
[1418 3056    0]
[ 415 3253    0]
[2200 1654    0]
[2348 4389    0]
[1995 4190    0]
[1099 3319    0]
[1196 3128    0]
[2257 3821    0]
[ 792 3877    0]
[ 346 4410    0]
[1297 4644    0]
[2445  448    0]
[ 488 3224    0]
[2318 2684    0]
[152 279   0]
[ 603 3978    0]
[1007 2917    0]
[1466 2205    0]
[875 776   0]
[1972 2302    0]
[ 565 3917    0]
[ 639 1295    0]
[1800  245    0]
[1566 1689    0]
[1593 4548    0]
[2378 4477    0]
[1965 4705    0]
[1935 2200    0]
[ 972 2548    0]
[1896 1652    0]
[2200 3058    0]
[  44 3802    0]
[2328 3915    0]
[ 519 1365    0]
[ 384 1529    0]
[2066 3944    0]
[ 194 4722    0]
[2312 2394    0]

In [104]:
intMat = df.pivot(index='indfasta', columns="indsmiles", values='score').to_numpy(dtype=np.float16)
n_p,n_m = intMat.shape
Ip, Jm = np.where(intMat==1)
nb_positive_inter = int(len(Ip))
Inp, Jnm = np.where(intMat==0)
Inkp, Jnkm = np.where(np.isnan(intMat))

Mm, bin_edges = np.histogram(Ip[train_index], bins = range(n_p+1)) # np.array with  #interactions for each protein of the train at the beginning
Mp, bin_edges = np.histogram(Jm[train_index], bins = range(n_m+1)) # np.array with  #interactions for each drugs at the beginning (how manu time it can be chosen)
train = np.zeros([1,3], dtype=int)
nb_prot = len(list(set(Ip[train_index]))) # number of different prot in train
j = np.argmax(Mm)
print(j)
indice_P = Jm[train_index][np.where(Ip[train_index]==j)[0]]
indice_P

969


array([1158, 1208, 1227, 1251, 1268, 1297, 1338, 1395, 1455, 1486, 1507,
       1546, 1585, 1623, 1638, 1642, 1646, 1671, 1797, 1823, 1854, 1885,
       1888, 1920, 1994, 2113, 2151, 2185, 2232, 2384, 2411, 2414, 2756,
       2863, 2939, 3013, 3016, 3033, 3075, 3097, 3105, 3156, 3191, 3202,
       3234, 3392, 3420, 3433, 3439, 3442, 3458, 3460, 3461, 3462, 3465,
       3466, 3467, 3499, 3507, 3511, 3517, 3520, 3567, 3569, 3570, 3598,
       3613, 3614, 3616, 3620, 3640, 3641, 3709, 3738, 3770, 3838, 3858,
       3871, 3872, 3874, 3878, 3880, 3881, 3882, 3883, 3884, 3885, 3887,
       3888, 3918, 3949, 3957, 3961, 3962, 3979, 3995, 3996, 4081, 4093,
       4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4158, 4210, 4214,
       4224, 4257])

In [105]:
indice_N = [k for k in Jm[train_index] if intMat[j][k]==0]
indice_NK = [k for k in Jm[train_index] if np.isnan(intMat[j][k])] #np.array  with index of interactions not known

indice_freq_mol = np.where(Mp>1)[0]  #drug's index with more than 2 interactions +
indice_poss_mol = np.where(Mp == 1)[0]  #drug's index with 1 interaction +

indice_freq_one_prot = np.intersect1d(indice_N, indice_freq_mol)
indice_poss_one_prot = np.intersect1d(indice_N, indice_poss_mol)

nb_positive_interactions = len(indice_P)
nb_frequent_hitters_negative_interactions = len(indice_freq_one_prot)

indice_freq_one_prot = np.intersect1d(indice_N, indice_freq_mol)
indice_poss_one_prot = np.intersect1d(indice_N, indice_poss_mol)
indice_freq_one_prot_NK = np.intersect1d(indice_NK, indice_freq_mol)
indice_poss_one_prot_NK = np.intersect1d(indice_NK, indice_poss_mol)

In [106]:
print(len(indice_P),len(indice_freq_one_prot))
print(len(indice_P), len(indice_freq_one_prot) + len(indice_poss_one_prot))
print(len(indice_P) , len(indice_freq_one_prot) + len(indice_poss_one_prot) + len(indice_freq_one_prot_NK))

112 0
112 0
112 1479


In [108]:
if len(indice_P) <= len(indice_freq_one_prot):
    # we shoot at random nb_positive_interactions in drugs with a lot of interactions
    indice_N_one_prot = np.random.choice(indice_freq_one_prot,
                                        len(indice_P), replace = False)
elif len(indice_P) <= len(indice_freq_one_prot) + len(indice_poss_one_prot):
    # we shoot at random nb_positive_interactions in drugs with a lot of interactions
    nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot)
    indice_N_one_prot_poss = np.random.choice(indice_poss_one_prot,
                                            nb_negative_interactions_remaining, replace = False )
    indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                    indice_N_one_prot_poss))
elif len(indice_P) <= len(indice_freq_one_prot) + len(indice_poss_one_prot) + len(indice_freq_one_prot_NK):
    # we shoot at random nb_positive_interactions in drugs with a lot of interactions
    nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot) - len(indice_poss_one_prot)
    indice_N_one_prot_poss = np.random.choice(indice_freq_one_prot_NK,
                                            nb_negative_interactions_remaining, replace = False ).astype(int)
    indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                    indice_poss_one_prot, indice_N_one_prot_poss))
    print(indice_N_one_prot)
else:
    # we shoot at random nb_positive_interactions in drugs with a lot of interactions
    nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot) - len(indice_poss_one_prot) - len(indice_freq_one_prot_NK)
    #print("nb_negative_interactions_remaining", nb_negative_interactions_remaining) # pas de solution...
    #print(indice_poss_one_prot_NK.shape)
    indice_N_one_prot_poss = np.random.choice(indice_poss_one_prot_NK,
                                            nb_negative_interactions_remaining, replace = False )
    indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                    indice_poss_one_prot, indice_freq_one_prot_NK, indice_N_one_prot_poss))

Mp[indice_N_one_prot.astype(int)]-=1

[4446.  684.  483. 2714. 1409. 4739. 4445. 4380. 2560. 2825.  467.  117.
 1016.   55. 2334.  320. 3252.   10.  854.   72. 1379.  522. 4499. 2688.
  606.  283.  962. 2617. 4014.  997. 1031. 1115. 1024.   93. 1009.  702.
  686.  965.  499.  655. 4430. 4255. 4720. 3357.  828. 4716.  450.  250.
  992.  339.  644. 2587. 1232.  806. 4768. 1147.   20. 3389.  938.  948.
 4513.  374.   15.  514.  364. 2489.  236.  867. 4770. 2534. 1303.  318.
  723.  206. 4477. 2689.  105. 4304. 3581. 4191.  305.  837. 2899.  134.
 4741.  749. 2429. 1506. 1531.  672.  208.  630.  988.  772. 4637. 4614.
   61.  978.   42.  991. 2809.  176.  447. 1871.  407.  559. 1120.  360.
  133. 1037.  469. 4727.]


In [109]:
Mm[j] = 0

indice = np.r_[indice_P,indice_N_one_prot].astype(int)
etiquette = [x if not np.isnan(x) else 0 for x in intMat[j][indice]]
A = np.stack((indice, etiquette), axis=-1)
B = np.c_[np.zeros(A.shape[0])+j,A].astype(int)
train = np.concatenate((train,B))

In [110]:
train

array([[   0,    0,    0],
       [ 969, 1158,    1],
       [ 969, 1208,    1],
       [ 969, 1227,    1],
       [ 969, 1251,    1],
       [ 969, 1268,    1],
       [ 969, 1297,    1],
       [ 969, 1338,    1],
       [ 969, 1395,    1],
       [ 969, 1455,    1],
       [ 969, 1486,    1],
       [ 969, 1507,    1],
       [ 969, 1546,    1],
       [ 969, 1585,    1],
       [ 969, 1623,    1],
       [ 969, 1638,    1],
       [ 969, 1642,    1],
       [ 969, 1646,    1],
       [ 969, 1671,    1],
       [ 969, 1797,    1],
       [ 969, 1823,    1],
       [ 969, 1854,    1],
       [ 969, 1885,    1],
       [ 969, 1888,    1],
       [ 969, 1920,    1],
       [ 969, 1994,    1],
       [ 969, 2113,    1],
       [ 969, 2151,    1],
       [ 969, 2185,    1],
       [ 969, 2232,    1],
       [ 969, 2384,    1],
       [ 969, 2411,    1],
       [ 969, 2414,    1],
       [ 969, 2756,    1],
       [ 969, 2863,    1],
       [ 969, 2939,    1],
       [ 969, 3013,    1],
 

In [76]:
def make_train_test(df,nb_folds):
  """
    make train and test sets
  """

  # algo Matthieu corrected
  intMat = df.pivot(index='indfasta', columns="indsmiles", values='score').to_numpy(dtype=np.float16)

  # Set the different folds
  skf_positive = model_selection.KFold(shuffle=True, n_splits=nb_folds)

  all_train_interactions_arr = []
  all_test_interactions_arr = []

  n_p,n_m = intMat.shape
  Ip, Jm = np.where(intMat==1)
  nb_positive_inter = int(len(Ip))
  Inp, Jnm = np.where(intMat==0)
  Inkp, Jnkm = np.where(np.isnan(intMat))

  for train_index, test_index in skf_positive.split(range(nb_positive_inter)):
      # 9' pour train
      #train_index = np.random.choice(train_index, int(p*len(train_index)), replace=False)

      Mm, bin_edges = np.histogram(Ip[train_index], bins = range(n_p+1)) # np.array with  #interactions for each protein of the train at the beginning

      Mp, bin_edges = np.histogram(Jm[train_index], bins = range(n_m+1)) # np.array with  #interactions for each drugs at the beginning (how manu time it can be chosen)

      train = np.zeros([1,3], dtype=int)

      nb_prot = len(list(set(Ip[train_index]))) # number of different prot in train
      for i in range(nb_prot):

          j = np.argmax(Mm) # choose protein with the maximum of interactions in the train

          indice_P = Jm[train_index][np.where(Ip[train_index]==j)[0]]  #np.array with index of interactions + in train
          indice_N = [k for k in Jm[train_index] if intMat[j][k]==0]
          indice_NK = [k for k in Jm[train_index] if np.isnan(intMat[j][k])] #np.array  with index of interactions not known

          indice_freq_mol = np.where(Mp>1)[0]  #drug's index with more than 2 interactions +
          indice_poss_mol = np.where(Mp == 1)[0]  #drug's index with 1 interaction +

          indice_freq_one_prot = np.intersect1d(indice_N, indice_freq_mol)
          indice_poss_one_prot = np.intersect1d(indice_N, indice_poss_mol)

          nb_positive_interactions = len(indice_P)
          nb_frequent_hitters_negative_interactions = len(indice_freq_one_prot)

          indice_freq_one_prot = np.intersect1d(indice_N, indice_freq_mol)
          indice_poss_one_prot = np.intersect1d(indice_N, indice_poss_mol)
          indice_freq_one_prot_NK = np.intersect1d(indice_NK, indice_freq_mol)
          indice_poss_one_prot_NK = np.intersect1d(indice_NK, indice_poss_mol)

          if len(indice_P) <= len(indice_freq_one_prot):
              # we shoot at random nb_positive_interactions in drugs with a lot of interactions
              indice_N_one_prot = np.random.choice(indice_freq_one_prot,
                                                  len(indice_P), replace = False)
          elif len(indice_P) <= len(indice_freq_one_prot) + len(indice_poss_one_prot):
              # we shoot at random nb_positive_interactions in drugs with a lot of interactions
              nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot)
              indice_N_one_prot_poss = np.random.choice(indice_poss_one_prot,
                                                      nb_negative_interactions_remaining, replace = False )
              indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                              indice_N_one_prot_poss))
          elif len(indice_P) <= len(indice_freq_one_prot) + len(indice_poss_one_prot) + len(indice_freq_one_prot_NK):
              # we shoot at random nb_positive_interactions in drugs with a lot of interactions
              nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot) - len(indice_poss_one_prot)
              indice_N_one_prot_poss = np.random.choice(indice_freq_one_prot_NK,
                                                      nb_negative_interactions_remaining, replace = False )
              indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                              indice_poss_one_prot, indice_N_one_prot_poss))
          else:
              # we shoot at random nb_positive_interactions in drugs with a lot of interactions
              nb_negative_interactions_remaining = len(indice_P) - len(indice_freq_one_prot) - len(indice_poss_one_prot) - len(indice_freq_one_prot_NK)
              #print("nb_negative_interactions_remaining", nb_negative_interactions_remaining) # pas de solution...
              #print(indice_poss_one_prot_NK.shape)
              indice_N_one_prot_poss = np.random.choice(indice_poss_one_prot_NK,
                                                      nb_negative_interactions_remaining, replace = False )
              indice_N_one_prot = np.concatenate((indice_freq_one_prot,
                                              indice_poss_one_prot, indice_freq_one_prot_NK, indice_N_one_prot_poss))

          Mp[indice_N_one_prot.astype(int)]-=1

          # this protein has been processed
          Mm[j] = 0

          indice = np.r_[indice_P,indice_N_one_prot].astype(int)
          etiquette = [x if not np.isnan(x) else 0 for x in intMat[j][indice]]
          A = np.stack((indice, etiquette), axis=-1)
          B = np.c_[np.zeros(A.shape[0])+j,A].astype(int)
          train = np.concatenate((train,B))

      train = train[1:]
      all_train_interactions_arr.append(train)
      print("train", train.shape)


      # test
      #test_index =  np.random.choice(test_index, int(p*len(test_index)), replace=False)
      # interactions + in test
      indice_P_t = np.c_[Ip[test_index],Jm[test_index], np.ones(len(test_index))].astype(int)

      # interactions - in test
      a = np.r_[np.c_[Inp,Jnm]] # all the zeros in the matrix (and NK ?)
      a1 = set(map(tuple, a))
      b = train[:,:2]   # all the interactions in the train
      b1 = set(map(tuple, b))
      indice_N_t = np.array(list(a1 - b1))#[:indice_P_t.shape[0],:] # we keep the same number of interactions - than interactions + in test, choosing the 0 in the matrix
      #print(len(indice_N_t))

      # add interactions np.nan in test

      if len(indice_N_t) == 0:
          # initialization
          indice_N_t = np.array([-1, -1]).reshape(1,2)

      c = np.r_[np.c_[Inkp,Jnkm]] # all the np.nan in the matrix

      if len(indice_N_t) < indice_P_t.shape[0]:
          # we add some interactions - in test to have the same number of interactions + and - in test choose in the np.nan in the matrix
          k = 0
          while len(indice_N_t) < indice_P_t.shape[0]+1:
              i = np.random.randint(0, len(c))
              if tuple(c[i]) not in b1:
                  indice_N_t = np.concatenate((indice_N_t, c[i].reshape(1,2)))
                  k += 1

      # we drop the first row of indice_N_t if is [-1, -1]
      if indice_N_t[0,0] == -1:
          indice_N_t = indice_N_t[1:,:]

      indice_N_t = indice_N_t[:len(indice_P_t),:]

      # we add the column of 0 for the etiquette
      indice_N_t = np.c_[indice_N_t, np.zeros(len(indice_N_t))].astype(int)
      test = np.r_[indice_P_t,indice_N_t]

      all_test_interactions_arr.append(test)
      print("test", test.shape)

  print("Train/test datasets prepared.")
  return all_train_interactions_arr, all_test_interactions_arr

In [77]:
all_train_interactions_arr, all_test_interactions_arr = make_train_test(df,5)

KeyboardInterrupt: 