# In [None]:
import numpy as np
import pandas as pd
import os
import sys
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Dense, Conv2D, BatchNormalization, Activation, AveragePooling2D, GlobalAveragePooling2D, Input, concatenate, MaxPooling2D
from keras.optimizers import Adam
from keras.callbacks import LearningRateScheduler, ReduceLROnPlateau
from keras.regularizers import l2
from keras import backend as K
from sklearn.metrics import f1_score, precision_recall_curve, roc_curve, confusion_matrix, auc
import pickle
import json
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

debug = True

dataFileName = '/kaggle/input/concatenated-pca/concatenated_data_Amazon_PCA.csv'

# Read the whole CSV into a DataFrame.
df = pd.read_csv(dataFileName)

# Features are every column except the last one; the last column is the label.
# NOTE(review): an earlier comment claimed the first two columns are not
# required, but this slice keeps them -- confirm which is intended.
x = df.iloc[:, :-1].values
# Cast labels to int. This does not binarize anything; it presumably relies on
# the labels already being 0/1 -- TODO confirm against the dataset.
y = df.iloc[:, -1].values.astype(int)

# Hold out 20% of the rows as the test split (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize using statistics learned from the training split only, then
# apply the same transform to the test split.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Insert singleton height and channel axes so every row becomes a
# (1, n_features, 1) input for the Conv2D layers.
x_train = x_train[:, np.newaxis, :, np.newaxis]
x_test = x_test[:, np.newaxis, :, np.newaxis]


# Define the DenseNet model
def dense_block(x, blocks, growth_rate):
    """Apply `conv_block` to a tensor `blocks` times in sequence.

    Args:
        x: Input tensor.
        blocks: How many times `conv_block` is applied.
        growth_rate: Forwarded unchanged to every `conv_block` call.

    Returns:
        The output of the last `conv_block`, or `x` itself when `blocks`
        is zero.
    """
    out = x
    for _ in range(blocks):
        # Each step feeds the previous step's output back in.
        out = conv_block(out, growth_rate)
    return out

# Reduce the pooling size in the transition block function
def transition_block(x, reduction, pool_size=1):
    """Compress the channel count and optionally average-pool the feature map.

    Applies BatchNormalization -> ReLU -> 1x1 Conv2D whose filter count is
    ``int(channels * reduction)``, then an AveragePooling2D layer when the
    feature map is at least as large as the pooling window.

    Args:
        x: Input tensor; axes 1 and 2 are treated as the spatial axes and the
            last axis as channels.
        reduction: Multiplier applied to the current channel count to get the
            number of filters of the 1x1 convolution.
        pool_size: Pooling window, either a single int (used for both spatial
            axes) or a ``(height, width)`` pair. The default of 1 makes the
            pooling layer a no-op in terms of spatial size.

    Returns:
        The transformed tensor.
    """
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    compressed_filters = int(K.int_shape(x)[-1] * reduction)
    x = Conv2D(
        compressed_filters,
        1,
        use_bias=False,
        kernel_initializer='he_normal',
        kernel_regularizer=l2(1e-4),
    )(x)

    # Accept both a scalar and a per-axis pair so callers can pool along a
    # single axis, e.g. (1, 2) when the feature map has height 1.
    try:
        pool_h, pool_w = pool_size
    except TypeError:
        pool_h = pool_w = pool_size

    # Skip pooling when the window would not fit inside the feature map.
    height, width = K.int_shape(x)[1:3]
    if height >= pool_h and width >= pool_w:
        x = AveragePooling2D(pool_size=pool_size)(x)
    return x

def conv_block(x, growth_rate):
    """Compute new feature maps from `x` and concatenate them onto `x`.

    The new features come from BatchNormalization -> ReLU -> 1x1 Conv2D with
    ``4 * growth_rate`` filters, followed by BatchNormalization -> ReLU ->
    3x3 Conv2D ('same' padding) with ``growth_rate`` filters.

    Args:
        x: Input tensor.
        growth_rate: Number of filters produced by the 3x3 convolution.

    Returns:
        `concatenate([x, new_features])`.
    """
    # 1x1 convolution stage.
    branch = BatchNormalization()(x)
    branch = Activation('relu')(branch)
    branch = Conv2D(
        filters=4 * growth_rate,
        kernel_size=1,
        use_bias=False,
        kernel_initializer='he_normal',
        kernel_regularizer=l2(1e-4),
    )(branch)

    # 3x3 convolution stage; 'same' padding keeps the spatial size so the
    # result can be concatenated with the input.
    branch = BatchNormalization()(branch)
    branch = Activation('relu')(branch)
    branch = Conv2D(
        filters=growth_rate,
        kernel_size=3,
        padding='same',
        use_bias=False,
        kernel_initializer='he_normal',
        kernel_regularizer=l2(1e-4),
    )(branch)

    return concatenate([x, branch])

def densenet_model(input_shape, depth, num_classes=1, growth_rate=12, reduction=0.5):
    """Build a DenseNet-style Keras model with a sigmoid output layer.

    Layout: 3x3 Conv2D stem with ``2 * growth_rate`` filters, then three dense
    blocks with a transition block after the first and second, then global
    average pooling and a Dense layer with sigmoid activation.

    Args:
        input_shape: Shape of one input sample (without the batch axis).
        depth: Nominal depth; must be of the form ``3N + 4``. Each dense block
            gets ``N = (depth - 4) // 3`` conv blocks.
        num_classes: Number of units in the sigmoid output layer.
        growth_rate: Growth rate passed to the dense blocks.
        reduction: Channel multiplier passed to the transition blocks.

    Returns:
        An uncompiled `Model`.

    Raises:
        ValueError: If ``depth - 4`` is not a multiple of 3.
    """
    # NOTE(review): every conv_block holds two convolutions, so `depth` is a
    # nominal figure rather than the actual layer count -- confirm intended.
    layers_per_block, leftover = divmod(depth - 4, 3)
    if leftover != 0:
        raise ValueError('Depth must be 3N + 4')

    inputs = Input(shape=input_shape)
    features = Conv2D(
        filters=2 * growth_rate,
        kernel_size=(3, 3),
        padding='same',
        use_bias=False,
        kernel_initializer='he_normal',
        kernel_regularizer=l2(1e-4),
    )(inputs)

    # Three dense blocks; a transition block follows all but the last.
    last_stage = 2
    for stage in range(3):
        features = dense_block(features, layers_per_block, growth_rate)
        if stage != last_stage:
            features = transition_block(features, reduction)

    pooled = GlobalAveragePooling2D()(features)
    outputs = Dense(num_classes, activation='sigmoid', kernel_initializer='he_normal')(pooled)

    return Model(inputs=inputs, outputs=outputs)

# Model parameters
input_shape = x_train.shape[1:]  # per-sample shape (batch axis dropped)
depth = 22  # must be of the form 3N + 4 (validated inside densenet_model)
model = densenet_model(input_shape=input_shape, depth=depth)

# Use the `learning_rate` keyword: the legacy `lr` alias is deprecated and is
# rejected outright by newer Keras releases.
# NOTE(review): 1e-1 is two orders of magnitude above Adam's documented
# default (1e-3) and may make training unstable -- confirm this is intended.
model.compile(
    optimizer=Adam(learning_rate=1e-1),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Model training
batch_size = 128
epochs = 30
# NOTE: the test split is also passed as validation data, so the per-epoch
# validation metrics in `history` are measured on the test set.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)

# Evaluate the trained model on the held-out test split (the same split that
# was passed to fit() as validation data).
scores = model.evaluate(x_test, y_test, verbose=1)

# Keep the raw sigmoid outputs: ROC / precision-recall curves sweep over
# thresholds and therefore need continuous scores, not hard 0/1 decisions.
y_probs = model.predict(x_test)
# Hard class decisions at the 0.5 threshold (same shape as the model output).
y_preds = (y_probs > 0.5).astype(int)
# Kept as a separate name for backward compatibility; identical to y_preds.
y_pred_binary = y_preds

y_true = y_test.ravel()
y_hat = y_preds.ravel()

# Threshold-dependent metrics use the hard decisions.
f1 = f1_score(y_true, y_hat, average='macro')
# labels=[0, 1] pins the matrix to 2x2 so the four-way unpack below still
# works if the test split happens to contain only one class.
cm = confusion_matrix(y_true, y_hat, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
tpr1 = tp / (tp + fn)
fpr1 = fp / (fp + tn)

# Calculate ROC and Precision-Recall curves from the predicted probabilities.
fpr, tpr, _ = roc_curve(y_true, y_probs.ravel())
roc_auc = auc(fpr, tpr)
precision, recall, _ = precision_recall_curve(y_true, y_probs.ravel())
prc_auc = auc(recall, precision)

# Save metrics and curves
metrics = {
    'Validation Loss': scores[0],
    'Validation Accuracy': scores[1],
    'F1 Score': f1,
    'True Positive Rate' : tpr1,
    'False Positive Rate' : fpr1,
    'Confusion Matrix': cm.tolist(),
    'ROC AUC': roc_auc,
    'PRC AUC': prc_auc,
    'FPR Array': fpr.tolist(),
    'TPR Array': tpr.tolist(),
    'Precision Array': precision.tolist(),
    'Recall Array': recall.tolist()
}
with open('evaluation_metrics_real_life_PCA.json', 'w') as file:
    json.dump(metrics, file)

# ROC curve: model curve plus the chance-level diagonal, written to a PNG.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.legend(loc="lower right")
roc_fig.savefig('roc_curve_real_life_PCA.png')

# Precision-recall curve, written to its own PNG.
pr_fig, pr_ax = plt.subplots()
pr_ax.plot(recall, precision, color='blue', lw=2, label='Precision-Recall curve (area = %0.2f)' % prc_auc)
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend(loc="lower left")
pr_fig.savefig('precision_recall_curve_real_life_PCA.png')