In [106]:
import pandas as pd 
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import precision_score
import tensorflow as tf

In [107]:
# Load the saved embeddings and split the archive into features and labels.
# np.load on an .npz returns a lazily-read NpzFile that keeps the file open,
# so use it as a context manager to make sure the handle is closed again.
with np.load('disprot_esm_embed_10188.npz') as data:
    # feature data (stored under key 'X')
    x_data = data['X']
    # ground truth (stored under key 'y')
    y_data = data['y']

# Every feature row needs exactly one label before the data can be split.
assert x_data.shape[0] == y_data.shape[0], "X and y have different numbers of rows"

x_data.shape

(10188, 1280)

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Failed initial Iteration 
```
ogModel = tf.keras.Sequential() 
ogModel.add(tf.keras.layers.Dense(units=25, input_shape=(1280,), activation='relu'))
```
* Failure point: over-compression of the features
    * This makes the model inconsistent: depending on how it compresses the 1280 features into 25 features, it could have a model loss of 4.0 or 0.6
```
ogModel.add(tf.keras.layers.Dense(units=15, activation='sigmoid'))
ogModel.add(tf.keras.layers.Dense(units=15, activation='relu'))
ogModel.add(tf.keras.layers.Dense(units=1, activation='relu'))
```
* Shortcoming of the model: with the over-compression at the first step, having only 4 layers isn't enough to accurately capture the relationships needed for prediction

In [195]:
# Funnel-shaped fully connected network. Input width is 1280 features; each
# Dense layer narrows the representation until a single output unit remains.
myModel = tf.keras.Sequential([
    # 1280 -> 640 (1/2), then three successive 3/4 reductions
    tf.keras.layers.Dense(units=640, activation='relu'),
    tf.keras.layers.Dense(units=480, activation='relu'),
    tf.keras.layers.Dense(units=360, activation='relu'),
    tf.keras.layers.Dense(units=270, activation='sigmoid'),
    # 270 -> 180 (2/3)
    tf.keras.layers.Dense(units=180, activation='relu'),
    # 180 -> 60 -> 20 (1/3 each)
    tf.keras.layers.Dense(units=60, activation='relu'),
    tf.keras.layers.Dense(units=20, activation='sigmoid'),
    # 20 -> 5 (1/4)
    tf.keras.layers.Dense(units=5, activation='relu'),
    # Output layer: one sigmoid unit, so each prediction lies between 0 and 1
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [196]:
# Binary cross-entropy loss for the single sigmoid output. The optimizer is
# written out explicitly; 'rmsprop' is what Keras uses when none is given.
myModel.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
)

In [None]:
# Single pass over the training split. epochs and batch_size are the Keras
# defaults, spelled out here so the training budget is visible to the reader.
myModel.fit(
    x_train,
    y_train,
    epochs=1,
    batch_size=32,
)

[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.5326


<keras.src.callbacks.history.History at 0x168a47d70>

In [198]:
# Score the held-out split; predict returns one probability per row as a column.
predictionRaw = myModel.predict(x_test)
print(predictionRaw)

# Collapse the (n, 1) column into a flat vector of probabilities.
predictionProbability = predictionRaw.flatten()
# Hard 0/1 labels by rounding to the nearest integer (i.e. a cutoff around 0.5).
prediction = np.round(predictionProbability)

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[[0.08940548]
 [0.834759  ]
 [0.87911105]
 ...
 [0.08608283]
 [0.08816461]
 [0.8248428 ]]


In [199]:
# Wrap the held-out labels in a DataFrame; the labels are then read from column 0.
test_y = pd.DataFrame(data=y_test)

In [200]:
# One row per test sample: rounded prediction, raw probability and true label.
performanceDataFrame = pd.DataFrame({
    'Prediction': prediction,
    'Prediction_Probability': predictionProbability,
    'Ground_Truth': test_y[0],
})

# Alternative decision thresholds: flag a sample as positive (1) when its
# probability is at least 0.90 (strict) or at least 0.10 (lenient).
performanceDataFrame['90_Probability'] = performanceDataFrame['Prediction_Probability'].ge(0.90).astype(int)
performanceDataFrame['10_Probability'] = performanceDataFrame['Prediction_Probability'].ge(0.10).astype(int)

In [201]:
# Show the per-sample table, then the shape of the subset scored at or above
# 0.90 (its first entry is the number of such rows).
display(performanceDataFrame)
display(
    performanceDataFrame.loc[
        performanceDataFrame['Prediction_Probability'].ge(0.90)
    ].shape
)

Unnamed: 0,Prediction,Prediction_Probability,Ground_Truth,90_Probability,10_Probability
0,0.0,0.089405,0.0,0,0
1,1.0,0.834759,1.0,0,1
2,1.0,0.879111,1.0,0,1
3,0.0,0.086549,0.0,0,0
4,0.0,0.125439,1.0,0,1
...,...,...,...,...,...
2033,1.0,0.550600,1.0,0,1
2034,1.0,0.900488,1.0,1,1
2035,0.0,0.086083,0.0,0,0
2036,0.0,0.088165,0.0,0,0


(107, 5)

In [None]:
# Confusion tables (predicted label in rows, ground truth in columns) for the
# rounded prediction and for the 10% / 90% probability thresholds.
crosstabpred, crosstab10, crosstab90 = (
    pd.crosstab(
        performanceDataFrame[predicted_column],
        performanceDataFrame['Ground_Truth'],
        margins=False,
    )
    for predicted_column in ('Prediction', '10_Probability', '90_Probability')
)

In [203]:
# Render the three confusion tables one after another for comparison.
display(
    crosstabpred,  # rounded prediction
    crosstab10,    # probability >= 0.10 counted as positive
    crosstab90,    # probability >= 0.90 counted as positive
)

Ground_Truth,0.0,1.0
Prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1426,205
1.0,30,377


Ground_Truth,0.0,1.0
10_Probability,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1388,142
1,68,440


Ground_Truth,0.0,1.0
90_Probability,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1453,478
1,3,104


### **General prediction** has an improved false positive rate but an increased false negative rate, which
* could be improved with tuning or feature engineering of the data 
* Next iteration will be trained and tested on data that has been engineered to have more positive cases 
    * the model seems to struggle with positive cases, as it doesn't have many positive cases to train on

### **10% probability** has low false negatives but a very high false positive rate
* too sensitive, not enough specificity 
### **90% probability** has next to no false positives but a sizeable amount of false negatives 
* need to increase sensitivity even at the cost of specificity 