In [2]:
import findspark
# Initialise findspark before pyspark is imported in the next cell
# (this is findspark's documented usage pattern).
findspark.init()
# Last expression of the cell: its return value is shown as the cell output
# (the Spark location that findspark resolved).
findspark.find()

'H:\\SPARK'

In [3]:
from pyspark.sql import SparkSession

# Initialize a local SparkSession (all available cores) with the driver memory
# and Hadoop locations this machine needs.
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('Spark') \
    .config("spark.driver.memory", "15g") \
    .config("spark.hadoop.home.dir", "H:/HADOOP/") \
    .config("spark.hadoop.conf.dir", "H:/HADOOP/etc/hadoop/") \
    .getOrCreate()

import sys

# Make the project root importable so that the `src` package used later resolves.
# Raw string: in a normal literal "\D" is an invalid escape sequence, which
# Python reports as a DeprecationWarning (SyntaxWarning from 3.12 onwards).
project_root = r"G:\Dissertation_Project"
# Guard against piling up duplicate entries when the cell is re-run.
if project_root not in sys.path:
    sys.path.append(project_root)

# Get SparkContext from the SparkSession
sc = spark.sparkContext


In [4]:
# Bare expression: renders the active SparkSession as the cell output.
spark

### BASE DATASET

In [5]:
# Read the base conversation CSV: the first line is the header and column
# types are inferred by Spark.
base_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../../Data/Custom_Datasets/conversation_datasets_GPT.csv")
)
# Preview the first 10 rows without truncating the long text columns.
base_df.show(n=10, truncate=False)

+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|Conversation_ID|Attacker_Helper                                                                                                                                                 |Victim                                                                                                                                                                                         |Conversation_Type|
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------

### PREPROCESSED DATASET

In [6]:
# Read the preprocessed conversation CSV: the first line is the header and
# column types are inferred by Spark.
preprocessed_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../../Data/Preprocessed_Datasets/DATASET_FINAL_PREPROCESSED.csv")
)
# Preview the first 10 rows without truncating the long text columns.
preprocessed_df.show(n=10, truncate=False)

+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Convert Conversation Columns into actual Arrays

In [7]:
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf
import ast

# UDF helper: convert the string representation of a list into the actual list
def str_to_array_of_arrays(s):
    """Parse a column value that holds the text form of a (nested) list.

    Args:
        s: Text containing a Python literal, e.g. "[['a', 'b'], ['c']]",
            or None for a null cell.

    Returns:
        The parsed Python object, or None when ``s`` is None.

    Raises:
        ValueError or SyntaxError: if ``s`` is not a valid Python literal.
    """
    # Null cells arrive in a Python UDF as None; literal_eval would raise on
    # them and fail the whole job, so the null is passed through instead.
    if s is None:
        return None
    # literal_eval only accepts literals, so no arbitrary code is executed.
    return ast.literal_eval(s)

# Expose the parser to Spark as a UDF whose declared return type is
# array<array<string>>.
str_to_array_of_arrays_udf = udf(
    str_to_array_of_arrays,
    ArrayType(ArrayType(StringType())),
)

# Replace both conversation columns (stored as text) with their parsed
# nested-array form; the remaining columns are left untouched.
df = (
    preprocessed_df
    .withColumn("Attacker_Helper", str_to_array_of_arrays_udf(preprocessed_df["Attacker_Helper"]))
    .withColumn("Victim", str_to_array_of_arrays_udf(preprocessed_df["Victim"]))
)

# Print the resulting schema to confirm the two columns are now nested arrays.
df.printSchema()

root
 |-- Conversation_ID: string (nullable = true)
 |-- Attacker_Helper: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- Victim: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- Conversation_Type: integer (nullable = true)



### Loading the pipeline and transforming the data

In [8]:
from pyspark.ml import PipelineModel
# NOTE(review): FlattenTransformer is not referenced by name in this cell; presumably the import
# is needed so the saved pipeline's custom stage can be resolved when it is loaded — confirm.
from src.CustonTransformers import FlattenTransformer

# Location of the saved TF-IDF pipeline, relative to the notebook's working directory.
pipeline_model_path = "./Pipelines/TF-IDF_Pipeline"

# Load the saved PipelineModel from disk (nothing is fitted here).
pipeline = PipelineModel.load(path=pipeline_model_path)

# Run the parsed conversations through the loaded pipeline stages.
df_assembled = pipeline.transform(df)

### Splitting the Data

In [9]:
# Split the rows roughly 80/20 into training and test sets. The explicit seed
# pins the random assignment so that re-running the notebook does not silently
# change the split (and with it every metric reported further down).
(train_data, test_data) = df_assembled.randomSplit([0.8, 0.2], seed=42)
# Preview the training split without truncating the wide columns.
train_data.show(truncate=False)

+----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# NEURAL NETWORK

## Creating a feed-forward neural network using Keras

In [10]:
import tensorflow as tf

### Preparing the data to be used as input in the Neural Network

In [11]:
import numpy


def _split_to_numpy(split_df, label_col='Conversation_Type', features_col='combined_features'):
    """Collect one data split to the driver as a pair of NumPy arrays.

    Both columns are fetched with a single ``collect()`` so that every label is
    taken from the same row as its feature vector, and the split is only
    evaluated once.

    Args:
        split_df: Spark DataFrame containing ``label_col`` and ``features_col``.
        label_col: Name of the column holding the class label.
        features_col: Name of the column holding the feature vector.

    Returns:
        Tuple ``(labels, features)`` of NumPy arrays with one entry per row.
    """
    rows = split_df.select(label_col, features_col).collect()
    labels_array = numpy.array([row[label_col] for row in rows])
    features_array = numpy.array([row[features_col] for row in rows])
    return labels_array, features_array


labels_numpy_train, features_numpy_train = _split_to_numpy(train_data)

print("Shape of -labels_numpy_train- array --> {}".format(labels_numpy_train.shape))
print("Shape of -features_numpy_train- array --> {}".format(features_numpy_train.shape))

###############################################################################################################

labels_numpy_test, features_numpy_test = _split_to_numpy(test_data)

print("Shape of -labels_numpy_test- array --> {}".format(labels_numpy_test.shape))
print("Shape of -features_numpy_test- array --> {}".format(features_numpy_test.shape))

Shape of -labels_numpy_train- array --> (2888,)
Shape of -features_numpy_train- array --> (2888, 400)
Shape of -labels_numpy_test- array --> (700,)
Shape of -features_numpy_test- array --> (700, 400)


## HYPERPARAMETER TUNING

### Model Definition

In [12]:
from kerastuner import HyperModel
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.optimizers import Adam, RMSprop, SGD, Nadam
from src.CustomNNMetrics import F1Score


class NNHyperModel(HyperModel):
    """Keras Tuner hypermodel for a binary classifier with three hidden layers.

    Each hidden layer is a ReLU ``Dense`` layer with tunable width and L1/L2
    kernel regularisation, followed by a tunable ``Dropout``. The output is a
    single sigmoid unit trained with binary cross-entropy. The optimizer and
    its settings are tuned as well.

    Args:
        *args: Forwarded unchanged to ``HyperModel.__init__``.
        input_dim: Number of input features per sample. Defaults to 400, the
            width that was previously hard-coded.
        **kwargs: Forwarded unchanged to ``HyperModel.__init__``.

    Note:
        ``build`` uses the module-level name ``tf`` (TensorFlow), which must
        already be imported when the model is built.
    """

    # Suffixes used in the hyperparameter names of the hidden layers, in build order.
    _HIDDEN_LAYERS = ('first', 'second', 'third')

    def __init__(self, *args, input_dim=400, **kwargs):
        super().__init__(*args, **kwargs)
        self.input_dim = input_dim

    def build(self, hp):
        """Build and compile one candidate model.

        Args:
            hp: Keras Tuner ``HyperParameters`` object used to sample values.

        Returns:
            The compiled ``Sequential`` model.
        """
        model = Sequential()

        for position, layer_name in enumerate(self._HIDDEN_LAYERS):
            # Hyperparameters are requested in a fixed order (units, l1, l2,
            # dropout) so the search space is registered the same way each time.
            units = hp.Int(f'units_{layer_name}_layer', min_value=16, max_value=128, step=16)
            l1 = hp.Float(f'l1_{layer_name}_layer', min_value=1e-7, max_value=1e-1, sampling='LOG')
            l2 = hp.Float(f'l2_{layer_name}_layer', min_value=1e-7, max_value=1e-1, sampling='LOG')

            # Only the first layer declares the input shape.
            input_kwargs = {'input_shape': (self.input_dim,)} if position == 0 else {}

            model.add(Dense(units=units,
                            activation='relu',
                            kernel_regularizer=l1_l2(l1=l1, l2=l2),
                            **input_kwargs))

            model.add(Dropout(rate=hp.Float(f'dropout_{layer_name}_layer', min_value=0.0, max_value=0.7, step=0.05)))

        # Output Layer
        model.add(Dense(1, activation='sigmoid'))

        model.compile(optimizer=self._build_optimizer(hp),
                      loss='binary_crossentropy',
                      metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), F1Score()])

        return model

    @staticmethod
    def _build_optimizer(hp):
        """Sample the optimizer type and its settings, and return the optimizer.

        NOTE(review): the optimizer-specific hyperparameters are requested
        inside plain ``if`` branches; ``hp.conditional_scope`` may be the better
        fit for parameters that only apply to one choice — confirm against the
        Keras Tuner documentation before changing an existing search directory.

        Raises:
            ValueError: if the sampled choice has no matching branch.
        """
        optimizer_choice = hp.Choice('optimizer', ['adam', 'sgd', 'rmsprop', 'nadam'])

        if optimizer_choice == 'adam':
            return Adam(
                learning_rate=hp.Float('adam_learning_rate', 1e-5, 1e-2, sampling='LOG'),
                beta_1=hp.Float('adam_beta_1', 0.85, 0.95, sampling='LOG'),
                beta_2=hp.Float('adam_beta_2', 0.99, 0.999, sampling='LOG')
            )
        if optimizer_choice == 'sgd':
            return SGD(
                learning_rate=hp.Float('sgd_learning_rate', 1e-5, 1e-2, sampling='LOG'),
                momentum=hp.Float('sgd_momentum', 0.1, 0.9, sampling='LOG')
            )
        if optimizer_choice == 'rmsprop':
            return RMSprop(
                learning_rate=hp.Float('rmsprop_learning_rate', 1e-5, 1e-2, sampling='LOG'),
                rho=hp.Float('rmsprop_rho', 0.85, 0.95, sampling='LOG')
            )
        if optimizer_choice == 'nadam':
            return Nadam(
                learning_rate=hp.Float('nadam_learning_rate', 1e-5, 1e-2, sampling='LOG'),
                beta_1=hp.Float('nadam_beta_1', 0.85, 0.95, sampling='LOG'),
                beta_2=hp.Float('nadam_beta_2', 0.99, 0.999, sampling='LOG')
            )

        # Fail with a clear message instead of an unbound-variable error if the
        # choice list and the branches above ever get out of sync.
        raise ValueError(f"Unsupported optimizer choice: {optimizer_choice!r}")


  from kerastuner import HyperModel


### Tuner Configuration

In [13]:
from kerastuner.tuners import Hyperband
from kerastuner import Objective

hyper_model = NNHyperModel()

# Hyperband search that minimises the validation loss.
#
# The tuner keeps its state in <directory>/<project_name>, so project_name must
# be a plain folder name. Repeating the directory path inside it produced the
# doubled location
#   ../../Logs/Keras_Tuning\../../Logs/Keras_Tuning/Neural_Network_Tuning
# which only worked because the "../.." segments climb back out again. The
# plain name below resolves to the same folder (assuming no symlinks are
# involved), so previously saved trials are still found.
tuner = Hyperband(
    hyper_model,
    objective=Objective("val_loss", direction="min"),
    max_epochs=200,
    directory='../../Logs/Keras_Tuning',
    project_name='Neural_Network_Tuning'
)

Reloading Tuner from ../../Logs/Keras_Tuning\../../Logs/Keras_Tuning/Neural_Network_Tuning\tuner0.json


### Searching for the Best Hyperparameters & Retrieving Best Model

In [14]:
# Run the hyperparameter search on the training arrays. validation_split=0.2 asks Keras to hold
# out 20% of them for validation, which provides the val_loss the tuner optimises.
# NOTE(review): the best trial printed in the next section reports "tuner/epochs: 8", so Hyperband
# appears to set the per-trial epoch budget itself — confirm whether epochs=200 has any effect here.
tuner.search(features_numpy_train, labels_numpy_train,
             epochs=200,
             validation_split=0.2)

# Keep the single best model found by the search.
best_model = tuner.get_best_models(num_models=1)[0]

### Reviewing the tuning results

In [15]:
# Ask the tuner's oracle for its single top-ranked trial.
best_trial = tuner.oracle.get_best_trials(num_trials=1)[0]

# Report which trial won and the score it achieved.
print("Best Trial ID: {}".format(best_trial.trial_id))
print("Best Trial Score: {}".format(best_trial.score))

# Mapping of hyperparameter name -> value chosen for that trial.
best_hyperparameters = best_trial.hyperparameters.values

# List every hyperparameter on its own tab-indented line.
print("Best Hyperparameters:")
for param, value in best_hyperparameters.items():
    print("\t{}: {}".format(param, value))

Best Trial ID: 0098
Best Trial Score: 0.39723142981529236
Best Hyperparameters:
	units_first_layer: 16
	l1_first_layer: 2.3291627372307405e-07
	l2_first_layer: 1.714878366156072e-07
	dropout_first_layer: 0.30000000000000004
	units_second_layer: 16
	l1_second_layer: 2.623265171464705e-06
	l2_second_layer: 2.594526939505927e-06
	dropout_second_layer: 0.4
	units_third_layer: 112
	l1_third_layer: 1.9427068813699489e-07
	l2_third_layer: 5.0632194967712215e-06
	dropout_third_layer: 0.6000000000000001
	optimizer: nadam
	adam_learning_rate: 0.0019693142806088922
	adam_beta_1: 0.9186224762463757
	adam_beta_2: 0.9941689099700385
	rmsprop_learning_rate: 0.001188587767282574
	rmsprop_rho: 0.9212952589419752
	nadam_learning_rate: 0.0038369690781954366
	nadam_beta_1: 0.9273480593126637
	nadam_beta_2: 0.9964712350678965
	sgd_learning_rate: 0.0008758094397234087
	sgd_momentum: 0.32406224717286486
	tuner/epochs: 8
	tuner/initial_epoch: 3
	tuner/bracket: 4
	tuner/round: 1
	tuner/trial_id: 0080


### Best Model

In [16]:
# Score the tuned model on the held-out test arrays. The five returned values
# are unpacked as the loss followed by the metrics the model was compiled with.
(
    test_loss,
    test_accuracy,
    test_precision,
    test_recall,
    test_f1score,
) = best_model.evaluate(features_numpy_test, labels_numpy_test)

# Report each value on its own line (accuracy first, then loss and the rest).
print("test_accuracy --> {}".format(test_accuracy))
print("test_loss --> {}".format(test_loss))
print("test_precision --> {}".format(test_precision))
print("test_recall --> {}".format(test_recall))
print("test_f1score --> {}".format(test_f1score))

test_accuracy --> 0.8914285898208618
test_loss --> 0.27653926610946655
test_precision --> 0.9243353605270386
test_recall --> 0.9205702543258667
test_f1score --> 0.9224488735198975


### Saving the model

In [17]:
# Persist the tuned model to disk as an HDF5 (.h5) file.
# NOTE(review): the cell output shows a warning raised from saving_api.save_model — presumably
# Keras' notice that HDF5 is a legacy format; confirm before switching formats, since other
# code may load this exact file name.
# NOTE(review): the model was compiled with the custom F1Score metric, so loading it later
# presumably requires passing that class via custom_objects — verify.
best_model.save("../Models/Trained_Models/NeuralNetwork_TFIDF/NeuralNetwork_TFIDF.h5")

  saving_api.save_model(
