In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, roc_auc_score, roc_curve
import tensorflow as tf
import matplotlib as plt

In [2]:
# Loading in the Data & splitting it into variables 
data = np.load('disprot_esm_embed_10188.npz')
# feature data
x_data = data['X']
# ground truth 
y_data = data['y']
x_data.shape

(10188, 1280)

In [3]:
# splitting the data into training and testing data 
x_train, x_test, y_train, y_test = train_test_split(
                                   x_data, y_data, test_size = 0.20, 
                                   random_state = 42)

# Failed initial Iteration 
```
ogModel = tf.keras.Sequential() 
ogModel.add(tf.keras.layers.Dense(units=25, input_shape=(1280,), activation='relu'))
```
* failure point over compression of features 
    * makes the model inconsistent as depending on how it compresses 1280 features into 25 features it could have a model loss of 4.0 or 0.6
```
ogModel.add(tf.keras.layers.Dense(units=15, activation='sigmoid'))
ogModel.add(tf.keras.layers.Dense(units=15, activation='relu'))
ogModel.add(tf.keras.layers.Dense(units=1, activation='relu'))
```
* short comming of the model with the over compression at the first step having only 4 layers isn't enough to accurately capture the realtionships for prediction 

In [82]:
ogModel = tf.keras.Sequential() 
ogModel.add(tf.keras.layers.Dense(units=25, input_shape=(1280,), activation='relu'))
ogModel.add(tf.keras.layers.Dense(units=15, activation='sigmoid'))
ogModel.add(tf.keras.layers.Dense(units=15, activation='relu'))
ogModel.add(tf.keras.layers.Dense(units=1, activation='relu'))


In [83]:
ogModel.compile(loss='binary_crossentropy')
ogModel.fit(x_train, y_train)

[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 669us/step - loss: 0.5310


<keras.src.callbacks.history.History at 0x36343ba70>

In [84]:
predictionRaw = ogModel.predict(x_test) 
print(predictionRaw)
predictionProbability = predictionRaw.flatten()
prediction = predictionProbability.round()

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 722us/step
[[0.09012659]
 [0.4610371 ]
 [0.7006131 ]
 ...
 [0.        ]
 [0.39316362]
 [1.015807  ]]


In [85]:
test_y = pd.DataFrame(y_test)
performanceDataFrame = pd.DataFrame(
                       data = {
                               'Prediction':prediction,
                               'Prediction_Probability':predictionProbability,
                               'Ground_Truth':test_y[0]
                               })

performanceDataFrame['90_Probability'] = (performanceDataFrame['Prediction_Probability'] >= 0.90).astype(int)
performanceDataFrame['10_Probability'] = (performanceDataFrame['Prediction_Probability'] >= 0.10).astype(int)

display(performanceDataFrame)
display(performanceDataFrame[performanceDataFrame['Prediction_Probability'] >= .90].shape)

Unnamed: 0,Prediction,Prediction_Probability,Ground_Truth,90_Probability,10_Probability
0,0.0,0.090127,0.0,0,0
1,0.0,0.461037,1.0,0,1
2,1.0,0.700613,1.0,0,1
3,0.0,0.063081,0.0,0,0
4,1.0,0.603847,1.0,0,1
...,...,...,...,...,...
2033,1.0,0.514094,1.0,0,1
2034,1.0,0.947070,1.0,1,1
2035,0.0,0.000000,0.0,0,0
2036,0.0,0.393164,0.0,0,1


(207, 5)

In [86]:
crosstabpred = pd.crosstab(performanceDataFrame['Prediction'],
                           performanceDataFrame['Ground_Truth'], margins= False)

crosstab10 = pd.crosstab(performanceDataFrame['10_Probability'],
                         performanceDataFrame['Ground_Truth'], margins= False)

crosstab90 = pd.crosstab(performanceDataFrame['90_Probability'],
                         performanceDataFrame['Ground_Truth'], margins= False)

display(crosstabpred, crosstab10, crosstab90)

Ground_Truth,0.0,1.0
Prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1385,175
1.0,71,407


Ground_Truth,0.0,1.0
10_Probability,Unnamed: 1_level_1,Unnamed: 2_level_1
0,727,15
1,729,567


Ground_Truth,0.0,1.0
90_Probability,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1454,377
1,2,205


In [87]:
# AUC & Percision Calculations 
falsePos, truePos, thresh = roc_curve(y_test, prediction)
areaUnderCurve = roc_auc_score(y_test, prediction)
precision = precision_score(y_test, prediction)
print(f'False Positive rate of {falsePos[1]:.4f} \nTrue Positive rate {truePos[1]:.4f}')
print(f'\nPrecision Score of {precision:.4f} \nArea Under Curve {areaUnderCurve:.4f}')

False Positive rate of 0.0488 
True Positive rate 0.6993

Precision Score of 0.8515 
Area Under Curve 0.8253


# Intermediate Model

```
### initial number of features: 1280
myModel = tf.keras.Sequential()
### 3/4 compression 
myModel.add(tf.keras.layers.Dense(units = 640, activation = 'relu'))
myModel.add(tf.keras.layers.Dense(units = 480, activation = 'relu'))
myModel.add(tf.keras.layers.Dense(units = 360, activation = 'relu'))
myModel.add(tf.keras.layers.Dense(units = 270, activation = 'sigmoid'))
### 2/3 compression
myModel.add(tf.keras.layers.Dense(units = 180, activation = 'relu'))
### 1/3 compression
myModel.add(tf.keras.layers.Dense(units = 60, activation = 'relu'))
myModel.add(tf.keras.layers.Dense(units = 20, activation = 'sigmoid'))
### 1/4 compression
myModel.add(tf.keras.layers.Dense(units = 5, activation = 'relu'))
### Output layer 
myModel.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))
```


In [98]:
### initial number of features: 1280
intModel = tf.keras.Sequential()
### 3/4 compression 
intModel.add(tf.keras.layers.Dense(units = 640, activation = 'relu'))
intModel.add(tf.keras.layers.Dense(units = 480, activation = 'relu'))
intModel.add(tf.keras.layers.Dense(units = 360, activation = 'relu'))
intModel.add(tf.keras.layers.Dense(units = 270, activation = 'sigmoid'))
### 2/3 compression
intModel.add(tf.keras.layers.Dense(units = 180, activation = 'relu'))
### 1/3 compression
intModel.add(tf.keras.layers.Dense(units = 60, activation = 'relu'))
intModel.add(tf.keras.layers.Dense(units = 20, activation = 'sigmoid'))
### 1/4 compression
intModel.add(tf.keras.layers.Dense(units = 5, activation = 'relu'))
### Output layer 
intModel.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))

In [99]:
intModel.compile(loss='binary_crossentropy')
intModel.fit(x_train, y_train)

[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.5527


<keras.src.callbacks.history.History at 0x360ae9250>

In [100]:
predictionRaw = intModel.predict(x_test) 
print(predictionRaw)
predictionProbability = predictionRaw.flatten()
prediction = predictionProbability.round()

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[[0.0988265 ]
 [0.5378324 ]
 [0.6573593 ]
 ...
 [0.09811343]
 [0.55882215]
 [0.6610931 ]]


In [101]:
test_y = pd.DataFrame(y_test)
performanceDataFrame = pd.DataFrame(
                       data = {
                               'Prediction':prediction,
                               'Prediction_Probability':predictionProbability,
                               'Ground_Truth':test_y[0]
                               })

performanceDataFrame['90_Probability'] = (performanceDataFrame['Prediction_Probability'] >= 0.90).astype(int)
performanceDataFrame['10_Probability'] = (performanceDataFrame['Prediction_Probability'] >= 0.10).astype(int)

display(performanceDataFrame)
display(performanceDataFrame[performanceDataFrame['Prediction_Probability'] >= .90].shape)

Unnamed: 0,Prediction,Prediction_Probability,Ground_Truth,90_Probability,10_Probability
0,0.0,0.098826,0.0,0,0
1,1.0,0.537832,1.0,0,1
2,1.0,0.657359,1.0,0,1
3,0.0,0.101014,0.0,0,1
4,1.0,0.621434,1.0,0,1
...,...,...,...,...,...
2033,1.0,0.658178,1.0,0,1
2034,1.0,0.660614,1.0,0,1
2035,0.0,0.098113,0.0,0,0
2036,1.0,0.558822,0.0,0,1


(0, 5)

In [102]:
crosstabpred = pd.crosstab(performanceDataFrame['Prediction'],
                           performanceDataFrame['Ground_Truth'], margins= False)

crosstab10 = pd.crosstab(performanceDataFrame['10_Probability'],
                         performanceDataFrame['Ground_Truth'], margins= False)

crosstab90 = pd.crosstab(performanceDataFrame['90_Probability'],
                         performanceDataFrame['Ground_Truth'], margins= False)

display(crosstabpred, crosstab10, crosstab90)

Ground_Truth,0.0,1.0
Prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1330,148
1.0,126,434


Ground_Truth,0.0,1.0
10_Probability,Unnamed: 1_level_1,Unnamed: 2_level_1
0,596,42
1,860,540


Ground_Truth,0.0,1.0
90_Probability,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1456,582


In [103]:
# AUC & Percision Calculations 
falsePos, truePos, thresh = roc_curve(y_test, prediction)
areaUnderCurve = roc_auc_score(y_test, prediction)
precision = precision_score(y_test, prediction)
print(f'False Positive rate of {falsePos[1]:.4f} \nTrue Positive rate {truePos[1]:.4f}')
print(f'\nPrecision Score of {precision:.4f} \nArea Under Curve {areaUnderCurve:.4f}')

False Positive rate of 0.0865 
True Positive rate 0.7457

Precision Score of 0.7750 
Area Under Curve 0.8296


# Final Model

In [111]:
# initial number of features: 1280
myModel = tf.keras.Sequential()
# 1/2 compression 
myModel.add(tf.keras.layers.Dense(units = 640, activation = 'relu'))
# 200 units per compression
myModel.add(tf.keras.layers.Dense(units = 440, activation = 'relu'))
myModel.add(tf.keras.layers.Dense(units = 240, activation = 'sigmoid'))
myModel.add(tf.keras.layers.Dense(units = 40, activation = 'relu'))
# 1/2 compression rounded
myModel.add(tf.keras.layers.Dense(units = 20, activation = 'relu'))
myModel.add(tf.keras.layers.Dense(units = 10, activation = 'relu'))
# Output layer 
myModel.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))

In [112]:
myModel.compile(loss='binary_crossentropy')
myModel.fit(x_train, y_train, class_weight={0 : 1.0, 1 : 1.55})

[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.5970


<keras.src.callbacks.history.History at 0x36872a420>

In [113]:
predictionRaw = myModel.predict(x_test) 
print(predictionRaw)
predictionProbability = predictionRaw.flatten()
prediction = predictionProbability.round()

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[[0.0123358 ]
 [0.80099803]
 [0.8918076 ]
 ...
 [0.00536963]
 [0.29570377]
 [0.9603249 ]]


In [114]:
test_y = pd.DataFrame(y_test)
performanceDataFrame = pd.DataFrame(
                       data = {
                               'Prediction':prediction,
                               'Prediction_Probability':predictionProbability,
                               'Ground_Truth':test_y[0]
                               })

performanceDataFrame['90_Probability'] = (performanceDataFrame['Prediction_Probability'] >= 0.90).astype(int)
performanceDataFrame['10_Probability'] = (performanceDataFrame['Prediction_Probability'] >= 0.10).astype(int)

display(performanceDataFrame)
display(performanceDataFrame[performanceDataFrame['Prediction_Probability'] >= .90].shape)

Unnamed: 0,Prediction,Prediction_Probability,Ground_Truth,90_Probability,10_Probability
0,0.0,0.012336,0.0,0,0
1,1.0,0.800998,1.0,0,1
2,1.0,0.891808,1.0,0,1
3,0.0,0.033883,0.0,0,0
4,1.0,0.601748,1.0,0,1
...,...,...,...,...,...
2033,1.0,0.635620,1.0,0,1
2034,1.0,0.975192,1.0,1,1
2035,0.0,0.005370,0.0,0,0
2036,0.0,0.295704,0.0,0,1


(373, 5)

In [115]:
crosstabpred = pd.crosstab(performanceDataFrame['Prediction'],
                           performanceDataFrame['Ground_Truth'], margins= False)

crosstab10 = pd.crosstab(performanceDataFrame['10_Probability'],
                         performanceDataFrame['Ground_Truth'], margins= False)

crosstab90 = pd.crosstab(performanceDataFrame['90_Probability'],
                         performanceDataFrame['Ground_Truth'], margins= False)

display(crosstabpred, crosstab10, crosstab90)

Ground_Truth,0.0,1.0
Prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1387,100
1.0,69,482


Ground_Truth,0.0,1.0
10_Probability,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1039,14
1,417,568


Ground_Truth,0.0,1.0
90_Probability,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1440,225
1,16,357


In [116]:
# ROC Curve & AUC
falsePos, truePos, thresh = roc_curve(y_test, prediction)
areaUnderCurve = roc_auc_score(y_test, prediction)
precision = precision_score(y_test, prediction)
print(f'False Positive rate of {falsePos[1]:.4f} \nTrue Positive rate {truePos[1]:.4f}')
print(f'\nPrecision Score of {precision:.4f} \nArea Under Curve {areaUnderCurve:.4f}')


False Positive rate of 0.0474 
True Positive rate 0.8282

Precision Score of 0.8748 
Area Under Curve 0.8904
