In [92]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, roc_auc_score, roc_curve
import tensorflow as tf
import matplotlib as plt

In [93]:
# Loading in the Data & splitting it into variables 
data = np.load('disprot_esm_embed_10188.npz')
# feature data
x_data = data['X']
# ground truth 
y_data = data['y']
x_data.shape

(10188, 1280)

In [94]:
# splitting the data into training and testing data 
x_train, x_test, y_train, y_test = train_test_split(
                                   x_data, y_data, test_size = 0.20, 
                                   random_state = 42)

# Failed initial Iteration 
```
ogModel = tf.keras.Sequential() 
ogModel.add(tf.keras.layers.Dense(units=25, input_shape=(1280,), activation='relu'))
```
* failure point over compression of features 
    * makes the model inconsistent as depending on how it compresses 1280 features into 25 features it could have a model loss of 4.0 or 0.6
```
ogModel.add(tf.keras.layers.Dense(units=15, activation='sigmoid'))
ogModel.add(tf.keras.layers.Dense(units=15, activation='relu'))
ogModel.add(tf.keras.layers.Dense(units=1, activation='relu'))
```
* short comming of the model with the over compression at the first step having only 4 layers isn't enough to accurately capture the realtionships for prediction 

# Intermediate Model

```
### initial number of features: 1280
myModel = tf.keras.Sequential()
### 3/4 compression 
myModel.add(tf.keras.layers.Dense(units = 640, activation = 'relu'))
myModel.add(tf.keras.layers.Dense(units = 480, activation = 'relu'))
myModel.add(tf.keras.layers.Dense(units = 360, activation = 'relu'))
myModel.add(tf.keras.layers.Dense(units = 270, activation = 'sigmoid'))
### 2/3 compression
myModel.add(tf.keras.layers.Dense(units = 180, activation = 'relu'))
### 1/3 compression
myModel.add(tf.keras.layers.Dense(units = 60, activation = 'relu'))
myModel.add(tf.keras.layers.Dense(units = 20, activation = 'sigmoid'))
### 1/4 compression
myModel.add(tf.keras.layers.Dense(units = 5, activation = 'relu'))
### Output layer 
myModel.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))
```


# Final Model

In [95]:
# initial number of features: 1280
myModel = tf.keras.Sequential()
# 1/2 compression 
myModel.add(tf.keras.layers.Dense(units = 640, activation = 'relu'))
# 200 units per compression
myModel.add(tf.keras.layers.Dense(units = 440, activation = 'relu'))
myModel.add(tf.keras.layers.Dense(units = 240, activation = 'sigmoid'))
myModel.add(tf.keras.layers.Dense(units = 40, activation = 'relu'))
# 1/2 compression rounded
myModel.add(tf.keras.layers.Dense(units = 20, activation = 'relu'))
myModel.add(tf.keras.layers.Dense(units = 10, activation = 'relu'))
# Output layer 
myModel.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))

In [96]:
myModel.compile(loss='binary_crossentropy')

In [97]:
myModel.fit(x_train, y_train, class_weight={0 : 1.0, 1 : 1.55})

[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.6015


<keras.src.callbacks.history.History at 0x351893680>

In [98]:
predictionRaw = myModel.predict(x_test) 
print(predictionRaw)
predictionProbability = predictionRaw.flatten()
prediction = predictionProbability.round()

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[[0.02321588]
 [0.22129591]
 [0.75286645]
 ...
 [0.01557916]
 [0.33385763]
 [0.9327468 ]]


In [99]:
test_y = pd.DataFrame(y_test)

In [100]:
performanceDataFrame = pd.DataFrame(
                       data = {
                               'Prediction':prediction,
                               'Prediction_Probability':predictionProbability,
                               'Ground_Truth':test_y[0]
                               })

performanceDataFrame['90_Probability'] = (performanceDataFrame['Prediction_Probability'] >= 0.90).astype(int)
performanceDataFrame['10_Probability'] = (performanceDataFrame['Prediction_Probability'] >= 0.10).astype(int)

In [101]:
display(performanceDataFrame)
display(performanceDataFrame[performanceDataFrame['Prediction_Probability'] >= .90].shape)

Unnamed: 0,Prediction,Prediction_Probability,Ground_Truth,90_Probability,10_Probability
0,0.0,0.023216,0.0,0,0
1,0.0,0.221296,1.0,0,1
2,1.0,0.752866,1.0,0,1
3,0.0,0.029147,0.0,0,0
4,1.0,0.875515,1.0,0,1
...,...,...,...,...,...
2033,1.0,0.637525,1.0,0,1
2034,1.0,0.777398,1.0,0,1
2035,0.0,0.015579,0.0,0,0
2036,0.0,0.333858,0.0,0,1


(274, 5)

In [102]:
crosstabpred = pd.crosstab(performanceDataFrame['Prediction'],
                           performanceDataFrame['Ground_Truth'], margins= False)

crosstab10 = pd.crosstab(performanceDataFrame['10_Probability'],
                         performanceDataFrame['Ground_Truth'], margins= False)

crosstab90 = pd.crosstab(performanceDataFrame['90_Probability'],
                         performanceDataFrame['Ground_Truth'], margins= False)

In [103]:
display(crosstabpred, crosstab10, crosstab90)

Ground_Truth,0.0,1.0
Prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1411,108
1.0,45,474


Ground_Truth,0.0,1.0
10_Probability,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1043,13
1,413,569


Ground_Truth,0.0,1.0
90_Probability,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1452,312
1,4,270


# Additional Metrics 

In [104]:
# ROC Curve & AUC
falsePos, truePos, thresh = roc_curve(y_test, prediction)
areaUnderCurve = roc_auc_score(y_test, prediction)
precision = precision_score(y_test, prediction)
print(f'False Positive rate of {falsePos[1]:.4f} \nTrue Positive rate {truePos[1]:.4f}')
print(f'\nPrecision Score of {precision:.4f} \nArea Under Curve {areaUnderCurve:.4f}')


False Positive rate of 0.0309 
True Positive rate 0.8144

Precision Score of 0.9133 
Area Under Curve 0.8918


### **General prediction** has an improved false positive rate and a improved false negative rate 
* This is the final model but further improvements in the output could be made with a larger vector or better tuning 
### **10% probability** has a very low false negatives but a very high false positive rate
* too sensitive not enough specificity 
### **90% probability** has next to no false positives but has too many false negatives to be useable
* need to increase sensitivity even at the cost of specificity 

In [105]:
# Try a U-Net or RN network to compare?