In [None]:
from __future__ import print_function
import keras
from keras.layers import Dense, Conv2D, BatchNormalization, Activation
from keras.layers import AveragePooling2D, Input, Flatten, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, LearningRateScheduler
from keras.callbacks import ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator
from keras.regularizers import l2
from keras import backend as K
from sklearn.model_selection import train_test_split
from keras.models import Model
import numpy as np
from numpy import loadtxt
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from imblearn.over_sampling import SMOTE
from sklearn import metrics
import os
import sys
from tensorflow.keras.models import load_model
debug = True


f1_scores = []
fpr = []
tpr = []

# Load the dataset from CSV. Every field is read as a string here and parsed
# to int further down, once the identifier columns have been dropped.
dataFileName = '/kaggle/input/combined-dataset-before-smote/output-2.csv'  # Update this path to your CSV file
raw_dataset = np.genfromtxt(dataFileName, delimiter=',', dtype=str)  # Update delimiter if necessary
dataset = raw_dataset[:, 2:]  # Skip UID RID, adjust if your CSV structure is different

# Split dataset into training and validation sets (fixed seed for repeatability)
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

batch_size = 16  # trained all networks with batch_size=16

# format of the dataset
# <uid rid> <8-13 user-metadata values> <8-13 resource-metadata values> <4 operations>
# The last four columns are the operation labels; everything before them is metadata.
metadata = train_dataset.shape[1] - 4

# Derive the one-hot width from the FULL dataset. Letting to_categorical infer
# it separately per split yields different feature counts whenever the largest
# metadata code appears in only one of the two splits.
num_classes = int(dataset[:, :metadata].astype(int).max()) + 1
x_train = to_categorical(train_dataset[:, :metadata].astype(int), num_classes=num_classes)
x_test = to_categorical(val_dataset[:, :metadata].astype(int), num_classes=num_classes)

# Flatten each sample to a single vector of length metadata * num_classes.
# No trailing singleton axis is added: the features are consumed by Dense
# layers declared with a rank-2 input (input_dim=x_train.shape[1]).
x_train = x_train.reshape(x_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)

# Targets: the four operation columns as integers.
y_train = train_dataset[:, metadata:].astype(int)
y_test = val_dataset[:, metadata:].astype(int)


# Determine how many metadata attributes to hide from the model.
# We expose the first eight user and the first eight resource metadata attributes to the model.
# There are four operations.
# 8 + 8 + 4 = 20
#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled = scaler.transform(x_test)
#print(X_train.shape)
#print(y_test.shape)
#print(y_train.shape)
#print(y_test.shape)

# Small fully connected classifier over the flattened one-hot metadata.
model = Sequential()

# Adjust input_dim according to the number of features in your data
model.add(Dense(64, activation='relu', input_dim=x_train.shape[1]))
model.add(Dense(32, activation='relu'))
# One sigmoid unit per operation column. The loss is binary_crossentropy and
# the evaluation thresholds every output at 0.5 independently, so the outputs
# must be independent probabilities; softmax would force the four outputs to
# sum to 1. NOTE(review): this assumes the four operation columns are
# independent 0/1 labels -- confirm against the dataset.
model.add(Dense(4, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 60 epochs seems to be the sweet spot; tried 10, 20, 30, and 100
# batch size seems better at 16; tried 8, 32, 64
model.fit(x_train, y_train, epochs=60, batch_size=batch_size, validation_data=(x_test, y_test))

# Raw per-operation scores on the held-out split, plus Keras' own loss/accuracy.
y_pred = model.predict(x_test)
score = model.evaluate(x_test, y_test, verbose=0)

print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Persist the trained model under results/<outputFileName>.hdf5.
outputFileName = 'vgg16_nosmote'
DIR_ASSETS = 'results/'
PATH_MODEL = DIR_ASSETS + outputFileName + '.hdf5'

if debug:
    print('Saving trained vgg16 to {}.'.format(PATH_MODEL))
# exist_ok=True replaces the isdir()-then-mkdir() pair: no race between the
# check and the creation, and nested output directories are handled too.
os.makedirs(DIR_ASSETS, exist_ok=True)
model.save(PATH_MODEL)

# measure True Positive/ Negative, False Positive/ Negative

from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score, confusion_matrix, f1_score
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import roc_curve
from sklearn import metrics

# Score the held-out split and compute thresholded classification metrics.
y_pred = model.predict(x_test)

# Flatten predictions and targets so every (sample, operation) pair is
# treated as one binary decision.
y_pred_flat = y_pred.flatten()
y_test_flat = y_test.flatten()

# Fail loudly on a shape mismatch instead of silently truncating the targets.
if y_pred_flat.shape != y_test_flat.shape:
    raise ValueError(
        "Prediction/target size mismatch: {} vs {}".format(y_pred_flat.shape, y_test_flat.shape)
    )

# Binarize scores and targets at the decision threshold.
# Adjust this step according to your specific problem
threshold = 0.5
y_pred_binary = (y_pred_flat > threshold).astype(int)
y_test_binary = (y_test_flat > threshold).astype(int)

# f1 is support-weighted over both classes; precision and recall use
# scikit-learn's default binary averaging (positive class only).
f1 = f1_score(y_test_binary, y_pred_binary, average='weighted')
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary)
recall = recall_score(y_test_binary, y_pred_binary)

# Confusion matrix of truth vs. PREDICTION. Comparing the truth with itself
# always gives a diagonal matrix, i.e. TPR = 1 and FPR = 0.
# labels=[0, 1] guarantees a 2x2 matrix even if one class is absent.
cm = confusion_matrix(y_test_binary, y_pred_binary, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Guard the ratios so a split without positives (or negatives) reports 0.0
# instead of dividing by zero.
tpr_value = float(tp / (tp + fn)) if (tp + fn) else 0.0
fpr_value = float(fp / (fp + tn)) if (fp + tn) else 0.0

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision Score:", precision)
print("Recall Score:", recall)
print("TPR Score:", tpr_value)
print("FPR Score:", fpr_value)


In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve, auc

# Precision-recall and ROC curves are built from the continuous scores
# (y_pred_flat), not the already thresholded 0/1 predictions: binarized input
# collapses each curve to a single operating point and distorts the AUC.
# The per-curve threshold arrays get their own names so the scalar decision
# `threshold` is not overwritten.
precision_a, recall_a, pr_thresholds = precision_recall_curve(y_test_binary, y_pred_flat)
prc_auc = auc(recall_a, precision_a)
print("Area Under the PR Curve score: ", prc_auc)


fpr, tpr, roc_thresholds = roc_curve(y_test_binary, y_pred_flat)
roc_auc = auc(fpr, tpr)
print("Area Under the ROC Curve score: ", roc_auc)

In [None]:
import json

# Show the runtime type of every value about to be serialized; tpr and fpr
# are expected to be NumPy arrays at this point.
for label, value in (
    ("Type of f1:", f1),
    ("Type of precision:", precision),
    ("Type of recall:", recall),
    ("Type of tpr_value:", tpr_value),
    ("Type of fpr_value:", fpr_value),
    ("Type of roc_auc:", roc_auc),
    ("Type of prc_auc:", prc_auc),
    ("Type of tpr:", tpr),
    ("Type of fpr:", fpr),
):
    print(label, type(value))

# Convert the curve arrays to plain lists so the json module can encode them.
precision_list, recall_list = precision_a.tolist(), recall_a.tolist()
tpr_list, fpr_list = tpr.tolist(), fpr.tolist()

# Scalar metrics first, then the curve arrays; key order is preserved in the file.
data = dict([
    ("Average F1 Score", f1),
    ("Average Precision", precision),
    ("Average Recall", recall),
    ("Average True Positive Rate", tpr_value),
    ("Average False Positive Rate", fpr_value),
    ("Average ROC AUC", roc_auc),
    ("Average PRC AUC", prc_auc),
    ("TPR Array", tpr_list),
    ("FPR Array", fpr_list),
    ("Precision Array", precision_list),
    ("Recall Array", recall_list),
])

# Destination of the metrics report.
file_path = "/kaggle/working/VGG-16 - Synthetic.json"

# Serialize once, then write the resulting text to disk.
with open(file_path, "w") as json_file:
    json_file.write(json.dumps(data))

In [None]:
# Plot ROC curve
#plt.figure()
#plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
#plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
#plt.xlabel('False Positive Rate')
#plt.ylabel('True Positive Rate')
#plt.title('Receiver Operating Characteristic (ROC) Curve')
#plt.legend(loc="lower right")
#plt.savefig('/kaggle/working/roc_curve.png')
#plt.show()

# Plot Precision-Recall curve
#plt.figure()
#plt.plot(recall, precision, color='blue', lw=2, label='Precision-Recall curve (AUC = %0.2f)' % average_precision)
#plt.xlabel('Recall')
#plt.ylabel('Precision')
#plt.title('Precision-Recall Curve')
#plt.legend(loc="lower left")
#plt.savefig('/kaggle/working/precision_recall_curve.png')
#plt.show()