In [1]:
# Lertas Giorgos 

# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from platform import python_version
python_version()

'3.8.13'

### 1. Upload data

In [2]:
# read file with interractions and remove column with side effect polypharmacy code, because we have the names
combo=pd.read_csv('data/bio-decagon-combo.csv')
combo2 = combo.drop('Polypharmacy Side Effect', axis=1)

# Also, replace blank spaces, in names, with underscore
combo2['Side Effect Name'] = combo2['Side Effect Name'].str.replace(' ','_')

# 4649441 rows × 3 columns

In [3]:
import os 

path = '/Users/giorgoslertas/Desktop/Lertas_files/data_science/Thesis/ddi_pred'
os.chdir(path)    # os.getcwd() current working file

# read files with names, that export with the code in cmd from dgl-ke
entities = pd.read_csv('dgl-ke/entities.tsv', names = ['names'], index_col=None)
relations = pd.read_csv('dgl-ke/relations.tsv', names = ['names'], index_col=None)

# read files with embeddings, that export with the code in cmd from dgl-ke
entities_emb = np.load('ckpts/TransE_ddi_pr_0/ddi_pr_TransE_entity.npy')
relations_emb = np.load('ckpts/TransE_ddi_pr_0/ddi_pr_TransE_relation.npy')

### 2. Create X

In [4]:
# 1. Select rows with top 5 relations                             # 131.231
top5 = combo2[combo2['Side Effect Name'].isin(['arterial_pressure_NOS_decreased','anaemia',
                                               'Difficulty_breathing','nausea','neumonia'])]
# 2. find number of unique rows                                   # 54.449
top5_uni = top5[['STITCH 1','STITCH 2']].drop_duplicates()
# 3. sampling 10% of the unique interractions                      # 5.445
X_names = top5_uni.sample(frac=0.1)
X_names.sort_values(by=['STITCH 1','STITCH 2'])
X_names = X_names.reset_index().drop('index', axis=1)

# 5445 rows × 2 columns

In [5]:
# function to create X values 
def export_X(X):
    
    # for each drug, add embeddings from the exported file of dgl-ke
    x1 = [entities_emb[entities[entities.names == i].index] for i in X['STITCH 1']]
    x2 = [entities_emb[entities[entities.names == i].index] for i in X['STITCH 2']]
    
    # reshape all to 2d matrix
    x1 = np.reshape(x1, (len(x1), 400))
    x2 = np.reshape(x2, (len(x2), 400))
    
    # create X values with concatenation of 2 files with interracted drugs
    x = [np.concatenate([sub1,sub2]) for sub1, sub2 in zip(x1, x2)]
    x = pd.DataFrame(np.transpose(x))
    
    return x

X ∈ R (d×m)  ...  where d is the dimensionality of the data, and m is the number of instances.

In [6]:
X = export_X(X_names) 

# 800 rows × 5445 columns

### 3. Create Y

Y ∈ {−1, 1}m×z  ...  m is the number of instances, and z the classes. 

In [7]:
Y_names = top5.set_index(['STITCH 1','STITCH 2'])['Side Effect Name'].str.get_dummies().groupby(['STITCH 1','STITCH 2']).max()
Y = Y_names.reset_index()
Y = pd.merge(X_names, Y, on=['STITCH 1', 'STITCH 2'])
Y = Y.drop(['STITCH 1', 'STITCH 2'], axis=1)
Y[Y == 0] = -1

# 5445 rows × 5 columns

### 4. Create S & S΄

collect only the relations that we use and transpose them because of  S ∈ a×z

In [8]:
# read file with biobert embeddings
bio_embs = pd.read_csv('bio_embeddings/embeddings.csv')

# Also, replace blank spaces, in names, with underscore
bio_embs['relations'] = bio_embs['relations'].str.replace(' ','_')

S = bio_embs[bio_embs['relations'].isin(['arterial_pressure_NOS_decreased','anaemia',
                                               'Difficulty_breathing','nausea','neumonia'])].T
S.columns = S.loc['relations'].values
S = S.drop(index=('relations'))
S = S[Y.columns]#.astype(float)

# 768 rows × 5 columns

In [9]:
S_absent = S[['Difficulty_breathing','anaemia']]
S_present = S.drop(['Difficulty_breathing','anaemia'], axis=1)

### 5. Create Y & Y΄ and X & X΄

In [10]:
# find indices in absent classes ['Difficulty_breathing','anaemia'], where the values are equal to 1
absent_indices = Y[(Y.Difficulty_breathing==1) | (Y.anaemia==1)].index

# create Y with absent classes
Y_absent = Y.iloc[absent_indices][['Difficulty_breathing','anaemia']]
Y_absent

Unnamed: 0,Difficulty_breathing,anaemia
0,1,-1
1,-1,1
2,1,1
3,1,1
4,1,-1
...,...,...
5438,1,1
5441,1,1
5442,-1,1
5443,1,1


In [11]:
# create X with absent classes
X_absent = X[absent_indices]
X_absent

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5429,5431,5432,5435,5437,5438,5441,5442,5443,5444
0,0.464672,-0.314897,0.459162,0.356336,-0.272028,0.364367,-0.459387,-0.389620,0.021889,0.420456,...,-0.297104,-0.307261,-0.420698,-0.208003,-0.331828,0.408908,-0.142596,-0.252212,-0.526935,-0.351854
1,-0.332346,0.092278,-0.572518,0.242962,-0.516096,-0.453739,0.254361,0.406237,0.186877,0.363436,...,-0.351879,0.098718,-0.327388,-0.202796,-0.247185,0.325644,0.041543,-0.411011,-0.507935,0.260044
2,-0.298523,-0.384005,-0.603375,0.096113,-0.453205,0.520131,-0.406442,-0.088162,-0.472998,-0.532297,...,-0.321358,0.275252,0.499072,0.179136,0.141135,0.399296,-0.314624,-0.192679,0.292399,-0.630538
3,-0.612753,0.004499,-0.234430,-0.074004,-0.212147,-0.428992,-0.610591,-0.388243,0.629810,0.464912,...,0.671540,0.616057,0.150006,0.054943,-0.410904,0.503822,-0.224172,-0.490041,-0.156797,0.128597
4,-0.256939,0.250384,0.325195,0.648651,-0.199269,-0.480494,0.316216,0.213054,0.354426,-0.205395,...,0.243060,0.174355,0.381496,0.792894,-0.072138,-0.461226,0.225239,0.688627,-0.112970,-0.287105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,-0.494024,0.218015,0.339690,-0.370426,0.289296,-0.026868,-0.419028,-0.378596,0.358899,-0.335826,...,-0.307221,-0.586974,0.333825,-0.496469,0.396618,-0.194514,0.153817,0.412407,-0.447837,0.473309
796,-0.394951,-0.017122,-0.393887,0.334971,0.393481,-0.326920,-0.496641,0.154986,-0.320322,-0.497160,...,0.528875,-0.542308,-0.548182,-0.196869,0.281368,-0.183570,0.218195,-0.266860,0.323018,0.560926
797,0.124194,-0.203994,-0.361563,-0.094142,0.388628,-0.398121,0.394689,0.235866,-0.495860,-0.093100,...,0.357391,0.411169,0.203907,0.161661,-0.280192,-0.274901,-0.108911,-0.400689,-0.232042,-0.351448
798,0.458353,0.302189,0.461664,0.419487,-0.453386,0.163642,-0.571448,-0.266955,-0.160613,-0.471359,...,-0.119553,0.573955,0.492984,0.149746,0.293923,-0.341554,0.400523,0.345429,-0.331318,0.453213


In [12]:
# create Y with present classes
Y_present = Y.drop(absent_indices, axis=0).drop(['Difficulty_breathing','anaemia'], axis=1)
Y_present

Unnamed: 0,arterial_pressure_NOS_decreased,nausea,neumonia
10,-1,1,-1
12,1,-1,-1
17,-1,-1,1
19,1,1,1
20,-1,-1,1
...,...,...,...
5433,-1,1,-1
5434,1,1,1
5436,-1,-1,1
5439,-1,-1,1


In [13]:
# create X with present classes
X_present = X.drop(absent_indices, axis=1)
X_present

Unnamed: 0,10,12,17,19,20,24,28,30,35,36,...,5407,5412,5414,5416,5430,5433,5434,5436,5439,5440
0,-0.347306,-0.091214,-0.477806,-0.422746,0.446162,-0.347306,-0.248723,0.408908,0.504528,-0.318972,...,-0.272028,-0.116360,-0.470589,0.343367,-0.347306,-0.619363,-0.220812,-0.438605,-0.282444,0.453589
1,-0.496041,-0.160929,-0.439939,0.252483,-0.471062,-0.496041,-0.315339,0.325644,0.339990,0.322328,...,-0.516096,0.450025,-0.106396,0.283212,-0.496041,0.519845,0.470905,-0.093722,-0.515941,0.338686
2,0.290017,-0.458237,0.442342,-0.320146,0.366756,0.290017,-0.089503,0.399296,0.527864,0.272748,...,-0.453205,-0.437332,0.385922,-0.355676,0.290017,0.529838,-0.276383,0.238314,0.400029,-0.444832
3,-0.119854,-0.350703,-0.780236,0.066513,0.259198,-0.119854,-0.472256,0.503822,-0.796864,-0.309422,...,-0.212147,-0.599229,-0.330790,-0.294719,-0.119854,0.260737,-0.021006,-0.450817,0.223309,-0.351513
4,-0.192315,0.456550,0.251816,0.257497,0.609998,-0.192315,0.568488,-0.461226,0.727208,0.466694,...,-0.199269,0.549738,0.016226,0.054246,-0.192315,0.296729,0.278277,-0.070858,0.032991,-0.499534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.473309,0.548562,-0.237944,-0.536977,-0.282231,0.289296,0.412088,-0.216825,0.297180,-0.380091,...,0.512352,-0.512266,-0.356547,0.481997,-0.510848,0.277576,-0.516375,0.297180,-0.185005,-0.492376
796,0.560926,0.529435,0.244034,0.342924,0.336993,0.393481,-0.575892,0.405999,0.188465,0.584888,...,-0.139032,-0.488212,-0.361671,-0.391699,0.527595,0.436122,-0.470645,0.188465,-0.250576,-0.173310
797,-0.351448,-0.472381,0.553183,0.494578,0.068021,0.388628,0.513421,-0.196260,0.445459,-0.509336,...,0.315502,0.370733,0.128980,0.074495,0.274862,-0.261379,-0.487783,0.445459,0.148588,0.472992
798,0.453213,0.170958,0.353549,0.004981,-0.102154,-0.453386,0.284746,0.134109,0.170731,-0.011371,...,0.583152,-0.511665,-0.571247,-0.444193,0.435389,0.439264,-0.533012,0.170731,-0.551854,-0.679026


### 6. function for V

W = V * S.T, where V ∈ R(d×a) (d=800(dd_emb->400*2), a=768(biobert embeddings))

In [14]:
# function to calculate V values
def calculate_V(X,Y,S,gamma=1,lamda=1):
    
    # create I table with len(X) dimensions. I table has on the diagonal of the ace and the other numbers are 0
    I = np.eye(len(X))
    part_1 = np.linalg.inv((X.dot(X.T)+gamma*I))

    # fix columns be in same row as Y columns
    S = S[Y.columns].astype(float)
    part_2 = X.dot(Y).dot(S.T)
    
    # create I table with len(S) dimensions. I table has on the diagonal of the ace and the other numbers are 0
    I = np.eye(len(S))
    part_3 = np.linalg.inv((S.dot(S.T)+lamda*I))
    
    # calculate V
    V = part_1.dot(part_2).dot(part_3)
    V = pd.DataFrame(V)
    
    return V

In [None]:
V = calculate_V(X,Y,S)
V


In [None]:
_absent

### 6. NN

In [10]:
test = Y.replace(-1, 0)
test

Unnamed: 0,Difficulty_breathing,anaemia,arterial_pressure_NOS_decreased,nausea,neumonia
0,1,1,1,0,1
1,0,1,0,1,1
2,1,0,1,1,1
3,1,0,0,1,0
4,1,0,0,0,1
...,...,...,...,...,...
5440,0,1,1,1,1
5441,0,0,0,0,1
5442,0,0,0,0,1
5443,0,0,0,0,1


In [11]:
# export X values from the sample
x = export_X(X_names)

# calculate the preds from the sample
train = x.T.dot(V).dot(S.values)
train.columns = test.columns
train

Unnamed: 0,Difficulty_breathing,anaemia,arterial_pressure_NOS_decreased,nausea,neumonia
0,0.240371,0.387552,0.653846,0.418786,0.48584
1,-1.023196,0.527432,-0.457647,-0.016962,0.779135
2,0.487877,-0.725605,-0.196367,0.257692,0.187726
3,-0.577157,-0.346602,-0.275869,-0.19566,-0.205091
4,-0.11915,0.173686,-0.561268,-0.121032,0.234029
...,...,...,...,...,...
5440,-0.42399,0.287011,0.279587,-0.319091,0.191344
5441,0.369682,0.335534,0.215661,-0.051906,0.247123
5442,0.214373,-0.215677,-0.127926,-0.400279,0.257583
5443,0.138964,0.16955,0.067887,0.11868,0.259026


In [12]:
from sklearn.model_selection import train_test_split

# create train and test
train = train.to_numpy().astype('float32')
test = test.to_numpy().astype('int32')

# set aside 30% of train and test data for evaluation
X_train, X_test, y_train, y_test = train_test_split(train, test,
                                                    test_size=0.3, shuffle = True, random_state = 8)

# Use the same function above for the validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=0.15, random_state= 8) # 0.15 x 0.7 = 0.105, 10.5% of data

                                                                                   # 0.85 x 0.7 = 0.105, 59.5% of data
print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))

X_train shape: (3239, 5)
y_train shape: (3239, 5)
X_test shape: (1634, 5)
y_test shape: (1634, 5)
X_val shape: (572, 5)
y_val shape: (572, 5)


In [None]:
from keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

# create neural network model to predict non-exclusive classes
def nn_model():
    model = Sequential()
    model.add(Dense(units=5,activation='relu'))
    model.add(Dense(units=15,activation='relu'))
    #model.add(Dropout(0.5))
    model.add(Dense(units=10,activation='relu'))
    model.add(Dense(units=5,activation='sigmoid'))

    
    # For a binary classification problem
    model.compile(optimizer='adam', loss='binary_crossentropy') # metrics=['accuracy']
    
    return model

In [None]:
# simple early stopping
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)

model = nn_model()
# fit model
model.fit(X_train, y_train, batch_size = 32, epochs = 150, verbose = 1, 
          validation_data = (X_test, y_test))#, callbacks=[early_stop])

In [None]:
# Here we can see parameters and output shapes in each layer
model.summary()

In [None]:
model.metrics_names

In [None]:
# model comparison of loss and accuracy between train and test 
model_loss = pd.DataFrame(model.history.history)
#model_loss[['accuracy','val_accuracy']].plot(figsize=(12,5), title='Compare accuracy of the model');
model_loss[['loss','val_loss']].plot(figsize=(12,5), title='Compare loss of the model');

In [None]:
training_score = model.evaluate(X_train,y_train,verbose=0)
test_score = model.evaluate(X_val, y_val, verbose = 0 )

print("Train loss: ", training_score)
print("Validation loss: ", test_score)
#print("Train accuracy: ", training_score[1])
#print("Test accuracy: ", test_score[1])

In [None]:
y_val[:9,:]

In [None]:
predictions[:9,:]