In [None]:
from Scripts.essentials import *
from sklearn.metrics import PrecisionRecallDisplay, auc
    
# Hyper-parameters shared by the training cell further down.
batch_size = 256
epochs = 300
lr = 0.00005

# Build the encoder and restore its saved weights; it is used below to
# transform the MANUAL inputs before the classifier is trained on them.
enc = make_encoder()
enc.load_weights("Models/data_encoders/(MANUAL)MutantVsWildtype_importance.h5")

In [None]:
# Pass every split of the MANUAL inputs through the encoder `enc`; the
# classifier in the next cell is trained on these encoder outputs.
p = "Data/"
train_x = enc.predict(np.load(p + "train_x_MANUAL.npy"), batch_size=128)
test_x = enc.predict(np.load(p + "test_x_MANUAL.npy"), batch_size=128)
val_x = enc.predict(np.load(p + "val_x_MANUAL.npy"), batch_size=128)

# Targets for the two model outputs: the "y_46" labels and the "lgm" labels.
train_y, test_y, val_y = (
    np.load(p + fname)
    for fname in ("train_y_46.npy", "test_y_46.npy", "val_y_46.npy")
)
train_lgm, test_lgm, val_lgm = (
    np.load(p + fname)
    for fname in ("train_lgm.npy", "test_lgm.npy", "val_lgm.npy")
)

# Shuffle train and validation with a fixed seed; the same permutation is
# applied to inputs and both label arrays so rows stay aligned.
# The test split is left in its stored order.
np.random.seed(0)
ix = np.arange(len(train_x))
np.random.shuffle(ix)
train_x, train_y, train_lgm = train_x[ix], train_y[ix], train_lgm[ix]

ix = np.arange(len(val_x))
np.random.shuffle(ix)
val_x, val_y, val_lgm = val_x[ix], val_y[ix], val_lgm[ix]

# Collapse the lgm labels to two classes: argmax index > 2 becomes class 0,
# everything else class 1, then re-encode as 2-column one-hot rows.
eye = np.eye(2)
train_lgm = eye[np.where(np.argmax(train_lgm, axis=1) > 2, 0, 1)]
test_lgm = eye[np.where(np.argmax(test_lgm, axis=1) > 2, 0, 1)]
val_lgm = eye[np.where(np.argmax(val_lgm, axis=1) > 2, 0, 1)]
print(train_lgm)

# Quick visual check of the first 100 encoded training samples.
plt.plot(train_x[:100].T)
plt.show()

In [None]:
# Stop once the validation loss has not improved for 4 consecutive epochs and
# roll the model back to the best weights seen during training.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience = 4,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True
)


import pickle
plt.rcParams.update({'font.size': 40})
plt.rcParams["font.family"] = "Times New Roman"
reset_seed()
# Model with two outputs: one sized for the y labels, one for the lgm labels.
sample_model = make_split_model(lr = lr, inp_size = len(train_x[0]), out_dims = [len(train_y[0]), len(train_lgm[0])])
sample_model.summary()

hist = sample_model.fit(train_x, [train_y, train_lgm],
                          epochs = epochs,
                          batch_size = batch_size,
                          validation_data = (val_x, [val_y, val_lgm]),
                          callbacks = [early_stop]
                       )

# Persist the trained weights, then reload them so the evaluation below runs
# on exactly what was written to disk.
sample_model.save_weights("Models/MANUAL_Bias_quantifier_after.h5")

sample_model.load_weights("Models/MANUAL_Bias_quantifier_after.h5")

# Convert the history.history dict to a pandas DataFrame and save it as CSV.
# The path is handed to pandas directly: a handle opened with plain
# open(..., 'w') lacks newline='' and yields blank rows between records on
# Windows.
hist_df = pd.DataFrame(hist.history)
hist_csv_file = 'history_with_encoder.csv'
hist_df.to_csv(hist_csv_file)

# Predict on the held-out test split; y_p[0] is the id output and y_p[1] the
# lgm output. argmax turns each into hard class indices.
y_p = sample_model.predict(test_x, batch_size = 128)
y_p_id = np.argmax(y_p[0], axis = 1)
y_p_lgm = np.argmax(y_p[1], axis = 1)

# For the id labels
y_t = np.argmax(test_y, axis = 1)
test_acc = balanced_accuracy_score(y_t, y_p_id)
print(test_acc)
gc.collect()

# True lgm class per test sample; does not depend on the loop variable, so it
# is computed once instead of once per class.
true_lgm_all = np.argmax(test_lgm, axis = 1)

id_accuracies = []
lgm_accuracies = []
displays_id = []
aupr_metrics_id = []
for n in np.unique(y_t):
    is_class = y_t == n

    # Base the AUPR on one vs. all
    # NOTE(review): the curve is built from hard 0/1 predictions, not class
    # scores, so it has a single operating point. Passing y_p[0][:, n] would
    # give a full curve but would change every reported AUPR value.
    temp_yt = np.where(is_class, 1, 0)
    temp_yp = np.where(y_p_id == n, 1, 0)
    disp = PrecisionRecallDisplay.from_predictions(temp_yt, temp_yp)
    # from_predictions opens a new figure on every call; close it so the loop
    # does not leave one stray figure per class. The display object keeps its
    # precision/recall data and is re-plotted on a shared axis below.
    plt.close(disp.figure_)
    displays_id.append(disp)
    aupr_metrics_id.append(auc(disp.recall, disp.precision))

    # Fraction of samples of class n whose id was predicted correctly
    sample_pred = y_p_id[is_class]
    acc = np.sum(np.where(sample_pred == n, 1, 0))/len(sample_pred)
    id_accuracies.append(np.round(acc, 2))

    # Fraction of samples of class n whose lgm label was predicted correctly
    sample_true_lgm = true_lgm_all[is_class]
    sample_pred = y_p_lgm[is_class]
    acc = np.sum(np.where(sample_pred == sample_true_lgm, 1, 0))/len(sample_pred)
    lgm_accuracies.append(np.round(acc, 2))

np.save("Results/(MANUAL)FinalIDAccuracies.npy", id_accuracies)
np.save("Results/(MANUAL)FinalLGMAccuracies.npy", lgm_accuracies)

# Overlay all per-class precision-recall curves on one axis.
fig, ax = plt.subplots(1, figsize = (10, 10))
for n, disp in enumerate(displays_id):
    disp.plot(ax, name = n+1)

ax.legend(loc='center left', bbox_to_anchor=(1.04, 0.5),
          ncol=3, fancybox=True, shadow=True)

plt.savefig("Images/Features/(MANUAL)EcoderEval_ID_After.png", format="png", transparent = True,
                    dpi = 300,
                    bbox_inches='tight',
                    pad_inches=0.5)
plt.show()

gc.collect()

# For the lgm labels
y_t = np.argmax(test_lgm, axis = 1)
test_acc = balanced_accuracy_score(y_t, y_p_lgm)
print(test_acc)
gc.collect()

displays_lgm = []
aupr_metrics_lgm = []
for n in np.unique(y_t):
    # One-vs-rest indicators for class n (hard predictions, not scores).
    temp_yt = np.where(y_t == n, 1, 0)
    temp_yp = np.where(y_p_lgm == n, 1, 0)
    disp = PrecisionRecallDisplay.from_predictions(temp_yt, temp_yp)
    # from_predictions opens its own figure; close it, the curve is re-drawn
    # on the shared axis below.
    plt.close(disp.figure_)
    displays_lgm.append(disp)

    aupr_metrics_lgm.append(auc(disp.recall, disp.precision))


# Overlay the per-class precision-recall curves on one axis.
fig, ax = plt.subplots(1, figsize = (10, 10))
for n, disp in enumerate(displays_lgm):
    disp.plot(ax, name = n+1)

ax.legend(loc='center left', bbox_to_anchor=(1.04, 1),
          ncol=3, fancybox=True, shadow=True)

plt.savefig("Images/Features/(MANUAL)EcoderEval_LGM_After.png", format="png", transparent = True,
                    dpi = 300,
                    bbox_inches='tight',
                    pad_inches=0.5)
plt.show()

In [None]:
# Average precision of every per-class id curve from the encoder evaluation.
# It is computed here because no earlier cell defines `avgpr`, so this cell
# would otherwise fail on a fresh kernel.
avgpr = [d.average_precision for d in displays_id]
np.save("Results/avgpr_id_after.npy", avgpr)
np.save("Results/aupr_id_after.npy", aupr_metrics_id)
np.save("Results/aupr_lgm_after.npy", aupr_metrics_lgm)

# Performance without encoder

In [None]:
gc.collect()
# Reload the MANUAL splits as stored on disk, i.e. without passing them
# through the encoder.
p = "Data/"
train_x, test_x, val_x = (
    np.load(p + fname)
    for fname in ("train_x_MANUAL.npy", "test_x_MANUAL.npy", "val_x_MANUAL.npy")
)
train_y, test_y, val_y = (
    np.load(p + fname)
    for fname in ("train_y_46.npy", "test_y_46.npy", "val_y_46.npy")
)
train_lgm, test_lgm, val_lgm = (
    np.load(p + fname)
    for fname in ("train_lgm.npy", "test_lgm.npy", "val_lgm.npy")
)

# Seeded shuffle of train and validation; one permutation per split is applied
# to inputs and both label arrays. The test split keeps its stored order.
np.random.seed(0)
ix = np.arange(len(train_x))
np.random.shuffle(ix)
train_x, train_y, train_lgm = train_x[ix], train_y[ix], train_lgm[ix]

ix = np.arange(len(val_x))
np.random.shuffle(ix)
val_x, val_y, val_lgm = val_x[ix], val_y[ix], val_lgm[ix]

# Reduce the lgm labels to two classes (argmax index > 2 -> 0, otherwise 1)
# and one-hot encode them again.
eye = np.eye(2)
train_lgm = eye[np.where(np.argmax(train_lgm, axis=1) > 2, 0, 1)]
test_lgm = eye[np.where(np.argmax(test_lgm, axis=1) > 2, 0, 1)]
val_lgm = eye[np.where(np.argmax(val_lgm, axis=1) > 2, 0, 1)]
print(train_lgm)


In [None]:
plt.rcParams.update({'font.size': 40})
plt.rcParams["font.family"] = "Times New Roman"
reset_seed()
# Rebuild the two-output model with input/output sizes taken from the loaded
# arrays so previously saved weights can be restored into it.
sample_model = make_split_model(lr = lr, inp_size = len(train_x[0]), out_dims = [len(train_y[0]), len(train_lgm[0])])
sample_model.summary()

# Forward slash on purpose: "\M" is not a valid string escape and a backslash
# is only a path separator on Windows. "/" works on every platform and matches
# the other paths in this notebook.
sample_model.load_weights("Models/MANUAL_Bias_quantifier.h5")

# Predict on the test split; y_p[0] is the id output, y_p[1] the lgm output.
y_p = sample_model.predict(test_x, batch_size = 128)
y_p_id = np.argmax(y_p[0], axis = 1)
y_p_lgm = np.argmax(y_p[1], axis = 1)

# For the id labels
y_t = np.argmax(test_y, axis = 1)
test_acc = balanced_accuracy_score(y_t, y_p_id)
print(test_acc)
gc.collect()

# True lgm class per test sample; does not depend on the loop variable, so it
# is computed once instead of once per class.
true_lgm_all = np.argmax(test_lgm, axis = 1)

id_accuracies = []
lgm_accuracies = []

displays_id = []
aupr_metrics_id = []
for n in np.unique(y_t):
    is_class = y_t == n

    # Base the AUPR on one vs. all
    # NOTE(review): the curve is built from hard 0/1 predictions, not class
    # scores, so it has a single operating point. Passing y_p[0][:, n] would
    # give a full curve but would change every reported AUPR value.
    temp_yt = np.where(is_class, 1, 0)
    temp_yp = np.where(y_p_id == n, 1, 0)
    disp = PrecisionRecallDisplay.from_predictions(temp_yt, temp_yp)
    # from_predictions opens a new figure on every call; close it so the loop
    # does not leave one stray figure per class. The display object keeps its
    # precision/recall data and is re-plotted on a shared axis below.
    plt.close(disp.figure_)
    displays_id.append(disp)
    aupr_metrics_id.append(auc(disp.recall, disp.precision))

    # Fraction of samples of class n whose id was predicted correctly
    sample_pred = y_p_id[is_class]
    acc = np.sum(np.where(sample_pred == n, 1, 0))/len(sample_pred)
    id_accuracies.append(np.round(acc, 2))

    # Fraction of samples of class n whose lgm label was predicted correctly
    sample_true_lgm = true_lgm_all[is_class]
    sample_pred = y_p_lgm[is_class]
    acc = np.sum(np.where(sample_pred == sample_true_lgm, 1, 0))/len(sample_pred)
    lgm_accuracies.append(np.round(acc, 2))

np.save("Results/(MANUAL)FinalIDAccuracies(Before).npy", id_accuracies)
np.save("Results/(MANUAL)FinalLGMAccuracies(Before).npy", lgm_accuracies)

# Overlay all per-class precision-recall curves on one axis.
fig, ax = plt.subplots(1, figsize = (10, 10))
for n, disp in enumerate(displays_id):
    disp.plot(ax, name = n+1)

ax.legend(loc='center left', bbox_to_anchor=(1.04, 0.5),
          ncol=3, fancybox=True, shadow=True)

plt.savefig("Images/Features/(MANUAL)EcoderEval_ID_Before.png", format="png", transparent = True,
                    dpi = 300,
                    bbox_inches='tight',
                    pad_inches=0.5)
plt.show()

gc.collect()


# For the lgm labels
y_t = np.argmax(test_lgm, axis = 1)
test_acc = balanced_accuracy_score(y_t, y_p_lgm)
print(test_acc)
gc.collect()

displays_lgm = []
aupr_metrics_lgm = []
for n in np.unique(y_t):
    # One-vs-rest indicators for class n (hard predictions, not scores).
    temp_yt = np.where(y_t == n, 1, 0)
    temp_yp = np.where(y_p_lgm == n, 1, 0)
    disp = PrecisionRecallDisplay.from_predictions(temp_yt, temp_yp)
    # from_predictions opens its own figure; close it, the curve is re-drawn
    # on the shared axis below.
    plt.close(disp.figure_)
    displays_lgm.append(disp)

    aupr_metrics_lgm.append(auc(disp.recall, disp.precision))

# Overlay the per-class precision-recall curves on one axis.
fig, ax = plt.subplots(1, figsize = (10, 10))
for n, disp in enumerate(displays_lgm):
    disp.plot(ax, name = n+1)

ax.legend(loc='center left', bbox_to_anchor=(1.04, 1),
          ncol=1, fancybox=True, shadow=True)

plt.savefig("Images/Features/(MANUAL)EcoderEval_LGM_Before.png", format="png", transparent = True,
                    dpi = 300,
                    bbox_inches='tight',
                    pad_inches=0.5)
plt.show()

In [None]:
# Figure styling for the box plot.
plt.rcParams["font.size"] = 40
plt.rcParams["font.family"] = "Times New Roman"

# Compare two per-class summaries of the id precision-recall curves: the
# trapezoidal AUPR values and the average precision stored on each display.
avgpr = [curve.average_precision for curve in displays_id]
boxplot_series = [aupr_metrics_id, avgpr]
plt.boxplot(boxplot_series)
plt.ylim([0, 1])
plt.savefig(
    "Images/Features/(MANUAL)EcoderEval_IDperLGM_Before_boxplot.png",
    format="png",
    transparent=True,
    dpi=300,
    bbox_inches='tight',
    pad_inches=0.5,
)
plt.show()

In [None]:
# Persist the per-class id metrics of the model evaluated without the encoder.
for out_path, values in (
    ("Results/avgpr_id.npy", avgpr),
    ("Results/aupr_id.npy", aupr_metrics_id),
):
    np.save(out_path, values)

In [None]:
gc.collect()
# Load the unsuffixed input splits (train_x/test_x/val_x) as stored on disk,
# without any encoder transformation.
p = "Data/"
train_x, test_x, val_x = (
    np.load(p + fname)
    for fname in ("train_x.npy", "test_x.npy", "val_x.npy")
)
train_y, test_y, val_y = (
    np.load(p + fname)
    for fname in ("train_y_46.npy", "test_y_46.npy", "val_y_46.npy")
)
train_lgm, test_lgm, val_lgm = (
    np.load(p + fname)
    for fname in ("train_lgm.npy", "test_lgm.npy", "val_lgm.npy")
)

# Seeded shuffle of train and validation; one permutation per split is applied
# to inputs and both label arrays. The test split keeps its stored order.
np.random.seed(0)
ix = np.arange(len(train_x))
np.random.shuffle(ix)
train_x, train_y, train_lgm = train_x[ix], train_y[ix], train_lgm[ix]

ix = np.arange(len(val_x))
np.random.shuffle(ix)
val_x, val_y, val_lgm = val_x[ix], val_y[ix], val_lgm[ix]

# Reduce the lgm labels to two classes (argmax index > 2 -> 0, otherwise 1)
# and one-hot encode them again.
eye = np.eye(2)
train_lgm = eye[np.where(np.argmax(train_lgm, axis=1) > 2, 0, 1)]
test_lgm = eye[np.where(np.argmax(test_lgm, axis=1) > 2, 0, 1)]
val_lgm = eye[np.where(np.argmax(val_lgm, axis=1) > 2, 0, 1)]
print(train_lgm)

In [None]:
plt.rcParams.update({'font.size': 40})
plt.rcParams["font.family"] = "Times New Roman"
reset_seed()
# Rebuild the two-output model with input/output sizes taken from the loaded
# arrays so previously saved weights can be restored into it.
sample_model = make_split_model(lr = lr, inp_size = len(train_x[0]), out_dims = [len(train_y[0]), len(train_lgm[0])])
sample_model.summary()

# Forward slash on purpose: "\R" is not a valid string escape and a backslash
# is only a path separator on Windows. "/" works on every platform and matches
# the other paths in this notebook.
sample_model.load_weights("Models/RAW_Bias_quantifier.h5")

# Predict on the test split; y_p[0] is the id output, y_p[1] the lgm output.
y_p = sample_model.predict(test_x, batch_size = 128)
y_p_id = np.argmax(y_p[0], axis = 1)
y_p_lgm = np.argmax(y_p[1], axis = 1)

# For the id labels
y_t = np.argmax(test_y, axis = 1)
test_acc = balanced_accuracy_score(y_t, y_p_id)
print(test_acc)
gc.collect()

# True lgm class per test sample; does not depend on the loop variable, so it
# is computed once instead of once per class.
true_lgm_all = np.argmax(test_lgm, axis = 1)

id_accuracies = []
lgm_accuracies = []

displays_id = []
aupr_metrics_id = []
for n in np.unique(y_t):
    is_class = y_t == n

    # Base the AUPR on one vs. all
    # NOTE(review): the curve is built from hard 0/1 predictions, not class
    # scores, so it has a single operating point. Passing y_p[0][:, n] would
    # give a full curve but would change every reported AUPR value.
    temp_yt = np.where(is_class, 1, 0)
    temp_yp = np.where(y_p_id == n, 1, 0)
    disp = PrecisionRecallDisplay.from_predictions(temp_yt, temp_yp)
    # from_predictions opens a new figure on every call; close it so the loop
    # does not leave one stray figure per class. The display object keeps its
    # precision/recall data and is re-plotted on a shared axis below.
    plt.close(disp.figure_)
    displays_id.append(disp)
    aupr_metrics_id.append(auc(disp.recall, disp.precision))

    # Fraction of samples of class n whose id was predicted correctly
    sample_pred = y_p_id[is_class]
    acc = np.sum(np.where(sample_pred == n, 1, 0))/len(sample_pred)
    id_accuracies.append(np.round(acc, 2))

    # Fraction of samples of class n whose lgm label was predicted correctly
    sample_true_lgm = true_lgm_all[is_class]
    sample_pred = y_p_lgm[is_class]
    acc = np.sum(np.where(sample_pred == sample_true_lgm, 1, 0))/len(sample_pred)
    lgm_accuracies.append(np.round(acc, 2))

np.save("Results/(RAW)FinalIDAccuracies(Before).npy", id_accuracies)
np.save("Results/(RAW)FinalLGMAccuracies(Before).npy", lgm_accuracies)

# Overlay all per-class precision-recall curves on one axis.
fig, ax = plt.subplots(1, figsize = (10, 10))
for n, disp in enumerate(displays_id):
    disp.plot(ax, name = n+1)

ax.legend(loc='center left', bbox_to_anchor=(1.04, 0.5),
          ncol=3, fancybox=True, shadow=True)

plt.savefig("Images/Features/(RAW)EcoderEval_ID_Before.png", format="png", transparent = True,
                    dpi = 300,
                    bbox_inches='tight',
                    pad_inches=0.5)
plt.show()

gc.collect()


# For the lgm labels
y_t = np.argmax(test_lgm, axis = 1)
test_acc = balanced_accuracy_score(y_t, y_p_lgm)
print(test_acc)
gc.collect()

displays_lgm = []
aupr_metrics_lgm = []
for n in np.unique(y_t):
    # One-vs-rest indicators for class n (hard predictions, not scores).
    temp_yt = np.where(y_t == n, 1, 0)
    temp_yp = np.where(y_p_lgm == n, 1, 0)
    disp = PrecisionRecallDisplay.from_predictions(temp_yt, temp_yp)
    # from_predictions opens its own figure; close it, the curve is re-drawn
    # on the shared axis below.
    plt.close(disp.figure_)
    displays_lgm.append(disp)

    aupr_metrics_lgm.append(auc(disp.recall, disp.precision))

# Overlay the per-class precision-recall curves on one axis.
fig, ax = plt.subplots(1, figsize = (10, 10))
for n, disp in enumerate(displays_lgm):
    disp.plot(ax, name = n+1)

ax.legend(loc='center left', bbox_to_anchor=(1.04, 1),
          ncol=1, fancybox=True, shadow=True)

plt.savefig("Images/Features/(RAW)EcoderEval_LGM_Before.png", format="png", transparent = True,
                    dpi = 300,
                    bbox_inches='tight',
                    pad_inches=0.5)
plt.show()

In [None]:
# Persist the per-class id AUPR values of the model evaluated on the raw inputs.
np.save(file="Results/aupr_id_RAW.npy", arr=aupr_metrics_id)

In [None]:
# Per-class id accuracies saved by the three evaluation cells.
before = np.load("Results/(MANUAL)FinalIDAccuracies(Before).npy")
after = np.load("Results/(MANUAL)FinalIDAccuracies.npy")
raw_test_acc = np.load("Results/(RAW)FinalIDAccuracies(Before).npy")
plt.rcParams["font.size"] = 30
plt.rcParams["font.family"] = "Times New Roman"

# Order classes along the x-axis by their "after" accuracy (ascending).
sorting = np.argsort(after)

plt.figure(figsize=(10, 5))

# One scatter series per condition; each legend entry carries the mean
# accuracy of that series rounded to two decimals.
positions = np.arange(len(before))
for prefix, values in (
    ("Raw data (", raw_test_acc),
    ("Before (", before),
    ("After (", after),
):
    series_label = prefix + str(np.round(np.mean(values), 2)) + ")"
    plt.scatter(positions, np.array(values)[sorting], label=series_label)
plt.ylim([-0.1, 1.1])
plt.legend(fontsize=20)

plt.savefig(
    "Images/Histories/(MANUAL)SpectrumEffectImprovement_accuracy.png",
    format="png",
    transparent=True,
    dpi=1000,
    bbox_inches='tight',
    pad_inches=0.5,
)
plt.show()

In [None]:
# Per-class id AUPR values saved by the three evaluation cells.
before = np.load("Results/aupr_id.npy")
after = np.load("Results/aupr_id_after.npy")
raw_test_auc = np.load("Results/aupr_id_RAW.npy")

plt.rcParams["font.size"] = 30
plt.rcParams["font.family"] = "Times New Roman"

# Order classes along the x-axis by their "after" AUPR (ascending).
sorting = np.argsort(after)

plt.figure(figsize=(10, 5))
# One scatter series per condition; each legend entry carries the mean AUPR
# of that series rounded to two decimals.
positions = np.arange(len(before))
for prefix, values in (
    ("Raw (", raw_test_auc),
    ("Before (", before),
    ("After (", after),
):
    series_label = prefix + str(np.round(np.mean(values), 2)) + ")"
    plt.scatter(positions, np.array(values)[sorting], label=series_label)
plt.ylim([-0.1, 1.1])
plt.legend(fontsize=20)

plt.savefig(
    "Images/Histories/(MANUAL)SpectrumEffectImprovement_aupr.png",
    format="png",
    transparent=True,
    dpi=1000,
    bbox_inches='tight',
    pad_inches=0.5,
)
plt.show()

In [None]:
# Per-class lgm accuracies saved by the three evaluation cells.
before = np.load("Results/(MANUAL)FinalLGMAccuracies(Before).npy")
after = np.load("Results/(MANUAL)FinalLGMAccuracies.npy")
raw = np.load("Results/(RAW)FinalLGMAccuracies(Before).npy")
plt.rcParams.update({'font.size': 30})
plt.rcParams["font.family"] = "Times New Roman"

# Use the same class order as the id-AUPR figure (ascending "after" AUPR).
# This cell previously picked up `sorting` as leftover state from the AUPR
# cell; deriving it here from the same saved array keeps the plot unchanged
# while letting the cell run on its own.
sorting = np.argsort(np.load("Results/aupr_id_after.npy"))

plt.figure(figsize = (10, 5))
# The "Raw" legend entry reports the mean of the raw series (it previously
# showed the mean of `before`).
plt.scatter(np.arange(len(before)), np.array(raw)[sorting], label = "Raw (" + str(np.round(np.mean(raw), 2)) + ")")
plt.scatter(np.arange(len(before)), np.array(before)[sorting], label = "Before (" + str(np.round(np.mean(before), 2)) + ")")
plt.scatter(np.arange(len(before)), np.array(after)[sorting], label = "After (" + str(np.round(np.mean(after), 2)) + ")")
plt.ylim([-0.1, 1.1])
plt.legend(fontsize = 20)

plt.savefig("Images/Histories/(MANUAL)LGMAccuracyChanges.png", format="png", transparent = True,
                    dpi = 1000,
                    bbox_inches='tight',
                    pad_inches=0.5)
plt.show()