# Python Processing Step for Multilabel FAIMS Data

### Packages

In [None]:
#Load all my packages
import pandas as pd
import numpy as np
import numpy as np
from numpy import array
from numpy import argmax         #finds the index of the maximum value in a vector
import os
import sklearn
import sklearn.ensemble
import skmultilearn
from scipy import sparse
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from skmultilearn.model_selection import  iterative_train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# stuff for exploring the classes
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
from skmultilearn.dataset import load_dataset
from collections import Counter
from skmultilearn.model_selection import iterative_train_test_split, iterative_stratification
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder
from skmultilearn.cluster.networkx import NetworkXLabelGraphClusterer
from skmultilearn.cluster.igraph import IGraphLabelGraphClusterer
import igraph as ig

In [None]:
# Configure CUDA through environment variables: order devices by PCI bus ID
# and make only device "0" visible, then list the GPUs TensorFlow detects.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})
print(tf.config.list_physical_devices('GPU'))

# Device string for the rest of the notebook.
# device = "CPU"  # if no GPU
device = "GPU:0"


### Use this for ecoli data
# data_path = "data/cvmax_singlelabel.csv"  # data with only one label
# df = pd.read_csv(data_path, low_memory=False) #read in data generated from R preprocessing
# y = df.loc[ : ,  'X20':'X95'].values

### Human data
data_path = "data/NEW_JMMdata_maxCVvalues.txt"
# Tab-separated table generated by the R preprocessing step.
df = pd.read_csv(data_path, sep="\t", low_memory=False)


def _normalise_column(name):
    """Return the column name used by the rest of the notebook.

    Purely numeric names get an "X" prefix, "z_modseq" becomes "SeqCharge",
    and every other name is returned unchanged.
    """
    if name.isnumeric():
        return "X" + str(name)
    if name == "z_modseq":
        return "SeqCharge"
    return name


cols = df.columns
new_cols = [_normalise_column(c) for c in cols]
df.columns = new_cols

# Positions and names of all columns whose name starts with "X".
xcols_idx = [pos for pos, name in enumerate(df.columns) if name.startswith("X")]
xcols = [df.columns[pos] for pos in xcols_idx]

In [None]:
# One-hot encode the maximum-CV label: for every row, the "X<maxcv_naomit>"
# column is set to 1 and all other X columns to 0.
#
# This replaces a per-row iloc/.loc loop (very slow on large tables) with one
# vectorised comparison per X column. The exploratory np.argmax(...) expression
# that used to open this cell had its result discarded and has been dropped.
cv_label_cols = "X" + df["maxcv_naomit"].astype(str)

# The row loop silently created a NEW column whenever a label had no matching
# X column (e.g. "X65.0" if the CV column is parsed as float), which left that
# row all-zero in y. Fail loudly instead.
unknown_labels = sorted(set(cv_label_cols) - set(xcols))
if unknown_labels:
    raise ValueError(
        "maxcv_naomit values without a matching X column: {}".format(unknown_labels)
    )

for col in xcols:
    df[col] = (cv_label_cols == col).astype("int64")

# Label matrix: every column from 'X20' through 'X95' (label slice, inclusive).
y = df.loc[:, 'X20':'X95'].values

In [None]:
# Inspect the current column names of df.
df.columns

In [None]:
# df.to_csv("data/new_jmm_singlelabel.csv")

### Processing

In [None]:
# Bring in the final labelling-scheme data and add the other features

# df = pd.read_csv(data_path, low_memory=False) #read in data generated from R preprocessing
# y = df.loc[ : ,  'X20':'X95'].values

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Vocabulary: every distinct character that occurs in the SeqCharge entries
# (each entry coerced with str()), plus the extra token 'END'.
seq = df['SeqCharge']
all_chars = ''.join(map(str, seq))
vocab = set(all_chars)
vocab.add('END')
len_vocab = len(vocab)
print(len_vocab)

In [None]:
# Per-row CV value; the set shows which distinct values occur in the data.
cv = df['maxcv_naomit']
set(cv)

In [None]:
# Character -> integer index lookup used to encode the sequences.
# The vocabulary is sorted before numbering: enumerating a set of strings
# directly yields an order that can differ between interpreter sessions
# (string hash randomisation), which would make the encoding -- and any model
# trained on it -- irreproducible.
char_index = {ch: idx for idx, ch in enumerate(sorted(vocab))}

# Length of the longest sequence. Entries are coerced with str(), matching how
# the vocabulary and the encoding cell treat them, so a non-string entry does
# not raise TypeError here.
maxlen = max(len(str(s)) for s in df.SeqCharge)
print(char_index)
print(maxlen)

In [None]:
# Encode every sequence as a list of exactly `maxlen` vocabulary indices:
#   * keep at most `maxlen` characters of the str() form of the entry,
#   * look each character up in char_index,
#   * right-pad shorter sequences with the index of 'END'.
x_name = [str(s)[:maxlen] for s in seq]
x = []
for s in x_name:
    encoded = [char_index[ch] for ch in s]
    encoded.extend(char_index["END"] for _ in range(maxlen - len(s)))
    x.append(encoded)

In [None]:
#Split the 50%+ threshold data into train and test keeping label distribution proportional
# X_train, y_train, X_test, y_test = iterative_train_test_split(np.asarray(x), y, test_size=0.30)

# Stratified 70/30 train/test split on the CV label (fixed random_state).
# The encoded sequences are converted to an array once instead of on every
# use, the split is pulled with next() rather than a direct __next__() call,
# and the former sss.get_n_splits(...) call, whose result was discarded, is
# dropped.
X_all = np.asarray(x)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_idx, test_idx = next(sss.split(X_all, df["maxcv_naomit"]))
train_idx = list(train_idx)
test_idx = list(test_idx)
X_train, X_test = X_all[train_idx, :], X_all[test_idx, :]
y_train, y_test = y[train_idx, :], y[test_idx, :]
# X_train, y_train, X_test, y_test = iterative_train_test_split(np.asarray(x), y, test_size=0.30)
# X_train.shape

In [None]:
# Position of the "X65" column within xcols.
# NOTE(review): the confusion-matrix cells further down rebind xcols to the
# names WITHOUT the "X" prefix, so re-running this cell after them raises
# ValueError.
xcols.index("X65")

In [None]:
# Sum of column 9 of the test label matrix.
# NOTE(review): 9 is presumably the value returned by xcols.index("X65") -- confirm.
np.sum(y_test[:, 9])  # see how many values are at 65

In [None]:
# Shape of the full label matrix.
y.shape

In [None]:
# Display the full label matrix.
y

# Model

In [None]:
# Number of distinct CV values present in the data.
len(set(cv))

In [None]:
def macro_f1(y, y_hat, thresh=0.5):
    """Compute the macro F1-score on a batch (mean of the per-label F1 scores).

    Args:
        y (Tensor): binary labels of shape (BATCH_SIZE, N_LABELS); integer or
            float dtype, cast to float32 internally.
        y_hat (float32 Tensor): probability matrix from forward propagation,
            shape (BATCH_SIZE, N_LABELS).
        thresh: probability value above which a label is predicted positive.

    Returns:
        Scalar float32 Tensor: macro F1 for the batch.
    """
    # TensorFlow does not promote dtypes implicitly, so multiplying the float32
    # predictions with integer labels (as the documented int32 input) raises.
    y = tf.cast(y, tf.float32)
    y_pred = tf.cast(tf.greater(y_hat, thresh), tf.float32)

    # Per-label counts over the batch axis.
    tp = tf.cast(tf.math.count_nonzero(y_pred * y, axis=0), tf.float32)
    fp = tf.cast(tf.math.count_nonzero(y_pred * (1 - y), axis=0), tf.float32)
    fn = tf.cast(tf.math.count_nonzero((1 - y_pred) * y, axis=0), tf.float32)

    # The small constant avoids 0/0 for labels with no positives at all.
    per_label_f1 = 2 * tp / (2 * tp + fn + fp + 1e-16)
    return tf.reduce_mean(per_label_f1)

In [None]:
# Report how many GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# Sequence classifier: embedding -> two stacked LSTMs (with dropout) -> dense
# head with one sigmoid unit per label column.
# Uses the notebook-level `device` string instead of a hard-coded "GPU:0", so
# the CPU fallback defined next to `device` actually takes effect.
with tf.device(device):
    model = tf.keras.Sequential()
    # input_dim must cover every index produced by char_index, i.e. the
    # vocabulary size; it was previously hard-coded to 51.
    model.add(layers.Embedding(input_dim=len_vocab, output_dim=40))
    # No input_shape here: this is not the first layer, and its input is the
    # embedding output, not a (maxlen, len_vocab) one-hot tensor.
    model.add(layers.LSTM(128, return_sequences=True))
    model.add(layers.Dropout(0.2))
    model.add(layers.LSTM(128, return_sequences=False))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(64, activation='relu'))
    # Output width follows the label matrix itself; len(set(cv)) is smaller
    # than y.shape[1] whenever a CV column never occurs in the data, which
    # would make the model output and the targets disagree in shape.
    model.add(layers.Dense(y.shape[1], activation='sigmoid'))
    adam = tf.keras.optimizers.Adam(
        learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,
        name='Adam')
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=adam,  metrics=[tf.keras.metrics.BinaryAccuracy()])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Convert the train/test arrays to tensors on the configured device.
# Uses the notebook-level `device` string instead of a hard-coded "GPU:0".
with tf.device(device):
    # Inputs are vocabulary indices feeding an Embedding layer, so they are
    # kept as integers rather than float32.
    Xtf_train = tf.convert_to_tensor(X_train, dtype=tf.int32)
    ytf_train = tf.convert_to_tensor(y_train, dtype=tf.float32)

    Xtf_test = tf.convert_to_tensor(X_test, dtype=tf.int32)
    ytf_test = tf.convert_to_tensor(y_test, dtype=tf.float32)

In [None]:
# Display the training label tensor.
ytf_train

In [None]:
# Stop once val_loss has not improved for 50 epochs. restore_best_weights
# rolls the model back to its best epoch when training is stopped early;
# without it the model keeps the weights from 50 epochs past its best
# validation loss, and all later predictions are made with those.
earlystop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=50, restore_best_weights=True)

# NOTE(review): the test split doubles as validation data here, so early
# stopping is driven by the same rows that are evaluated afterwards.
hist = model.fit(Xtf_train, ytf_train, epochs=250, batch_size=2048, validation_data=(Xtf_test, ytf_test), callbacks=[earlystop])

In [None]:
# Training vs. validation loss per epoch, saved as PNG and SVG.
for history_key, curve_label in (("loss", "Training"), ("val_loss", "Validation")):
    plt.plot(hist.history[history_key], label=curve_label)
plt.legend()
plt.title('Model performance', fontsize=26)
plt.xlabel("Epoch", fontsize=20)
plt.ylabel("Binary Cross-Entropy", fontsize=20)
for out_path in (
    "modelperformance_binarycrossentropy_singlelabel_human.png",
    "modelperformance_binarycrossentropy_singlelabel_human.svg",
):
    plt.savefig(out_path)

In [None]:
# Training vs. validation binary accuracy per epoch, saved as PNG and SVG.
# Author's note on the validation curve: heavy overfitting.
for history_key, curve_label in (
    ("binary_accuracy", "Training"),
    ("val_binary_accuracy", "Validation"),
):
    plt.plot(hist.history[history_key], label=curve_label)
plt.legend()
plt.title('Model performance', fontsize=26)
plt.xlabel("Epoch", fontsize=20)
plt.ylabel("Binary Accuracy", fontsize=20)
for out_path in (
    "modelperformance_binaryaccuracy_singlelabel_human.png",
    "modelperformance_binaryaccuracy_singlelabel_human.svg",
):
    plt.savefig(out_path)

In [None]:
# Model outputs for the test sequences (the NumPy array X_test is passed here,
# not the Xtf_test tensor).
preds = model.predict(X_test)

In [None]:
# Predicted class per row = column index of the largest model output.
preds_label = np.argmax(preds, axis=1)

In [None]:
# True class per row = column index of the largest entry in the label row.
ytest_label = np.argmax(y_test, axis=1)

In [None]:
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix

In [None]:
import matplotlib.patches

In [None]:
# Number of test samples.
len(X_test)

In [None]:
# Number of training samples.
len(X_train)

In [None]:
# Confusion matrix of the single-label predictions, normalised over the
# predicted class (columns), drawn with the first class at the top.
fig, ax = plt.subplots(1,1)
# NOTE: this rebinds the notebook-level xcols to the names WITHOUT the "X" prefix.
xcols = [i[1:] for i in df.columns if i.startswith("X")]
n_classes = len(xcols)  # replaces the hard-coded 16
# labels= keeps the matrix n_classes x n_classes even when a class is absent
# from both the true and the predicted labels, so the ticks stay aligned.
cm = confusion_matrix(ytest_label, preds_label, labels=np.arange(n_classes), normalize='pred')
im0 = ax.imshow(cm, origin="upper")
plt.xticks(np.arange(n_classes), xcols)
plt.yticks(np.arange(n_classes), xcols)
# confusion_matrix puts true classes on rows and predicted classes on columns;
# imshow draws rows along y and columns along x. The two axis labels were
# previously swapped.
plt.xlabel("Predicted label", fontsize=20)
plt.ylabel("True label", fontsize=20)
fig.colorbar(im0, ax=ax)
# Outline each diagonal cell (true == predicted) with a black square.
for i in range(n_classes):
    rect = matplotlib.patches.Rectangle((-0.5+i, -0.5+i), 1, 1, linewidth=2, edgecolor='k', facecolor='none')
    ax.add_patch(rect)
plt.title("Single-label prediction", fontsize=26)
plt.savefig("singlelabel_confusion_human.svg")
plt.savefig("singlelabel_confusion_human.png")

In [None]:
# IPython automagic (not plain Python): shows the current working directory.
pwd

In [None]:
# Same confusion matrix (normalised over the predicted class, i.e. columns),
# drawn with the first class at the bottom (origin="lower").
fig, ax = plt.subplots(1,1)
# NOTE: this rebinds the notebook-level xcols to the names WITHOUT the "X" prefix.
xcols = [i[1:] for i in df.columns if i.startswith("X")]
n_classes = len(xcols)  # replaces the hard-coded 16
# labels= keeps the matrix n_classes x n_classes even when a class is absent
# from both the true and the predicted labels, so the ticks stay aligned.
cm = confusion_matrix(ytest_label, preds_label, labels=np.arange(n_classes), normalize='pred')
im0 = ax.imshow(cm, origin="lower")
plt.xticks(np.arange(n_classes), xcols)
plt.yticks(np.arange(n_classes), xcols)
# confusion_matrix puts true classes on rows and predicted classes on columns;
# imshow draws rows along y and columns along x. The two axis labels were
# previously swapped.
plt.xlabel("Predicted label", fontsize=20)
plt.ylabel("True label", fontsize=20)
fig.colorbar(im0, ax=ax)
# Outline each diagonal cell (true == predicted) with a black square.
for i in range(n_classes):
    rect = matplotlib.patches.Rectangle((-0.5+i, -0.5+i), 1, 1, linewidth=2, edgecolor='k', facecolor='none')
    ax.add_patch(rect)
plt.title("Single-label prediction", fontsize=26)
plt.savefig("singlelabel_confusion_inverted_human.svg")
plt.savefig("singlelabel_confusion_inverted_human.png")

In [None]:
# Confusion matrix normalised over the TRUE class (rows), drawn with the first
# class at the bottom (origin="lower"). This variant is not saved to disk.
fig, ax = plt.subplots(1,1)
# NOTE: this rebinds the notebook-level xcols to the names WITHOUT the "X" prefix.
xcols = [i[1:] for i in df.columns if i.startswith("X")]
n_classes = len(xcols)  # replaces the hard-coded 16
# labels= keeps the matrix n_classes x n_classes even when a class is absent
# from both the true and the predicted labels, so the ticks stay aligned.
cm = confusion_matrix(ytest_label, preds_label, labels=np.arange(n_classes), normalize='true')
im0 = ax.imshow(cm, origin="lower")
plt.xticks(np.arange(n_classes), xcols)
plt.yticks(np.arange(n_classes), xcols)
# confusion_matrix puts true classes on rows and predicted classes on columns;
# imshow draws rows along y and columns along x. The two axis labels were
# previously swapped.
plt.xlabel("Predicted label", fontsize=20)
plt.ylabel("True label", fontsize=20)
fig.colorbar(im0, ax=ax)
# Outline each diagonal cell (true == predicted) with a black square.
for i in range(n_classes):
    rect = matplotlib.patches.Rectangle((-0.5+i, -0.5+i), 1, 1, linewidth=2, edgecolor='k', facecolor='none')
    ax.add_patch(rect)
plt.title("Single-label prediction", fontsize=26)
# plt.savefig("singlelabel_confusion_inverted_human.svg")
# plt.savefig("singlelabel_confusion_inverted_human.png")