# The independent test set result 

# Positive samples were taken from the DeepNGlyPred dataset

# Negative samples were taken from the Endoplasmic Reticulum, Golgi Apparatus (GA), Cell Membrane (Cm), and Extracellular (Ex) compartments, where the GA, Cm, and Ex negative sites have RSA > 0.5

In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

**Change to the working directory where the files are stored**

In [2]:
# Directory holding the files this notebook reads through relative paths.
# The original location stays the default, so existing runs are unaffected;
# set the GLYCO_DATA_DIR environment variable to run the notebook elsewhere.
basedir = os.environ.get("GLYCO_DATA_DIR", "/home/t326h379/Cell_Mem_ER_Extrac_Protein")
os.chdir(basedir)

In [3]:
# Read the independent test set (path is relative to the working directory).
independent_test_csv = "df_indepenent_test_again_done_that_has_unique_protein_and_unique_sequence.csv"
df_test = pd.read_csv(independent_test_csv)

In [4]:
# Number of rows per value of the "label" column in the independent test set.
independent_labels = df_test["label"]
independent_labels.value_counts()

0    1648
1     830
Name: label, dtype: int64

In [6]:
# Target vector: the "label" column of the independent test set.
y_independent = np.array(df_test["label"])

# Everything from column position 5 onward is fed to the model.
# NOTE(review): the first 5 columns are presumably identifiers/metadata — confirm
# against the CSV header.
N_LEADING_NON_FEATURE_COLUMNS = 5

# Slice into a new name instead of overwriting df_test: re-running this cell
# would otherwise drop five more columns each time, so the feature matrix (and
# possibly the "label" lookup above) would no longer match the first run.
features_independent = df_test.iloc[:, N_LEADING_NON_FEATURE_COLUMNS:]
X_independent = np.array(features_independent)

print(X_independent.shape,y_independent.shape)

(2478, 1024) (2478,)


In [7]:
# Restore the saved Keras model from its HDF5 file in the working directory.
model_path = "Final_GlycoBiology_ANN_Glycobiology_ER_RSA(GA_Extracell_cellmem)187.h5"
model = tf.keras.models.load_model(model_path)

In [8]:
# Print the layers, output shapes and parameter counts of the loaded model.
model.summary()

Model: "sequential_187"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_374 (Dense)            (None, 64)                65600     
_________________________________________________________________
dropout_187 (Dropout)        (None, 64)                0         
_________________________________________________________________
dense_375 (Dense)            (None, 2)                 130       
Total params: 65,730
Trainable params: 65,730
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Evaluate the loaded model on the independent test set and report MCC, the
# confusion matrix, accuracy, sensitivity, specificity, precision and the
# per-class classification report.

# Raw model outputs: one row per sample, one column per output unit (the final
# Dense layer has 2 units).
Y_pred = model.predict(X_independent)

# Predicted class = index of the highest-scoring output column.
# Thresholding each column at 0.5 *before* the argmax is fragile: any row where
# neither column (or both columns) exceeds 0.5 collapses to class 0 regardless
# of which score is larger. Taking the argmax of the raw scores avoids that.
# NOTE(review): if the final activation is a softmax the two decodings agree;
# confirm the activation, and that column i corresponds to label i.
y_pred = np.argmax(Y_pred, axis=1)

# Fix the label order so the matrix is always 2x2 (rows = true 0/1,
# columns = predicted 0/1), even if one class is absent from y_pred.
cm = confusion_matrix(y_independent, y_pred, labels=[0, 1])

# Computed once and reused for the report line below.
mcc = matthews_corrcoef(y_independent, y_pred)

print("1648_Matthews Correlation : ",mcc)
print()
print("Confusion Matrix : \n",cm)
print()
print("Accuracy on test set:   ",accuracy_score(y_independent, y_pred))
print()

# Row-major unpacking of the 2x2 matrix: [[TN, FP], [FN, TP]].
TN, FP, FN, TP = cm.ravel()

Sensitivity = TP/(TP+FN)   # recall of the positive class
Specificity = TN/(TN+FP)   # recall of the negative class
Precision = TP / (TP + FP)

print(f"Sensitivity:  {Sensitivity}")
print()
print(f"Specificity: {Specificity}")
print()
print(f"Precision:  {Precision}")
print()
print(classification_report(y_independent, y_pred))

1648_Matthews Correlation :  0.49599922865235846

Confusion Matrix : 
 [[1242  406]
 [ 195  635]]

Accuracy on test set:    0.7574656981436643

Sensitivity:  0.7650602409638554

Specificity: 0.7536407766990292

Precision:  0.6099903938520653

              precision    recall  f1-score   support

           0       0.86      0.75      0.81      1648
           1       0.61      0.77      0.68       830

    accuracy                           0.76      2478
   macro avg       0.74      0.76      0.74      2478
weighted avg       0.78      0.76      0.76      2478



# No protein is shared between the training set and the independent test set

In [14]:
import numpy as np

# Read the training set and show how many rows carry each label value.
training_csv = "df_train_data_without_indepenent_test_and_protein.csv"
df = pd.read_csv(training_csv)

training_label_counts = df["label"].value_counts()
print(training_label_counts)

0    15860
1     8405
Name: label, dtype: int64


In [11]:
# Distinct values of the "PID" column in the training set.
Training_Protein_ID = {protein_id for protein_id in df["PID"]}

In [12]:
# Re-read the independent test set so every column, including "PID", is available,
# then collect its distinct "PID" values.
independent_test_csv = "df_indepenent_test_again_done_that_has_unique_protein_and_unique_sequence.csv"
df_test = pd.read_csv(independent_test_csv)
Independent_Test_Set_Protein_ID = {protein_id for protein_id in df_test["PID"]}

In [13]:
# "PID" values present in both the training set and the independent test set.
Training_Protein_ID & Independent_Test_Set_Protein_ID

set()

# No sequence is shared between the training set and the independent test set

In [15]:
# Distinct values of the "Sequence" column in the training set.
Training_Protein_Sequence = {sequence for sequence in df["Sequence"]}

In [16]:
# Distinct values of the "Sequence" column in the independent test set.
Independent_Test_Set_Sequence = {sequence for sequence in df_test["Sequence"]}

In [17]:
# "Sequence" values present in both the training set and the independent test set.
Training_Protein_Sequence & Independent_Test_Set_Sequence

set()

# Thank You