In [2]:
import findspark
findspark.init()
findspark.find()

'H:\\SPARK'

In [3]:
from pyspark.sql import SparkSession

# Initialize SparkSession with necessary configurations
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('Spark') \
    .config("spark.driver.memory", "15g") \
    .config("spark.hadoop.home.dir", "H:/HADOOP/") \
    .config("spark.hadoop.conf.dir", "H:/HADOOP/etc/hadoop/") \
    .getOrCreate()
    
import sys
sys.path.append("G:\Dissertation_Project")

# Get SparkContext from the SparkSession
sc = spark.sparkContext


In [4]:
spark

### BASE DATASET

In [5]:
base_df = spark.read.csv("../../Data/Custom_Datasets/conversation_datasets_GPT.csv", header=True, inferSchema=True)
base_df.show(10, truncate=False)

+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|Conversation_ID|Attacker_Helper                                                                                                                                                 |Victim                                                                                                                                                                                         |Conversation_Type|
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------

### PREPROCESSED DATASET

In [6]:
preprocessed_df = spark.read.csv("../../Data/Preprocessed_Datasets/DATASET_FINAL_PREPROCESSED.csv", header=True, inferSchema=True)
preprocessed_df.show(10, truncate=False)

+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Convert Conversation Columns into actual Arrays

In [7]:
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf
import ast

# UDF to convert string representation of list to actual list
def str_to_array_of_arrays(s):
    # Convert the string to a list and then wrap it inside another list
    return [ast.literal_eval(s)][0]

str_to_array_of_arrays_udf = udf(str_to_array_of_arrays, ArrayType(ArrayType(StringType())))

df = preprocessed_df.withColumn("Attacker_Helper", str_to_array_of_arrays_udf(preprocessed_df["Attacker_Helper"])).withColumn("Victim", str_to_array_of_arrays_udf(preprocessed_df["Victim"]))

df.printSchema()

root
 |-- Conversation_ID: string (nullable = true)
 |-- Attacker_Helper: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- Victim: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- Conversation_Type: integer (nullable = true)



# NEURAL NETWORK

## Creating a feed forward neural network using keras. 

In [8]:
import tensorflow as tf

### Preparing the data to be used as input in the Neural Network

In [11]:
import pandas as pd 


dataframe = df.toPandas()

attacker_texts = dataframe['Attacker_Helper'].values
victim_texts = dataframe['Victim'].values

labels = dataframe['Conversation_Type'].values

# Ensure attacker_texts is a flat list of strings
flat_attacker_texts = [item for sublist in attacker_texts for item in (sublist if isinstance(sublist, list) else [sublist])]

# Ensure victim_texts is a flat list of strings
flat_victim_texts = [item for sublist in victim_texts for item in (sublist if isinstance(sublist, list) else [sublist])]

print(flat_attacker_texts[0])
print(flat_victim_texts[0])

['hi', 'thi', 'is', 'john', 'from', 'the', 'ir', 'you', 'owe', 'us', 'xxxxx', 'in', 'back', 'tax']
['hi', 'thi', 'is', 'john', 'from', 'the', 'ir', 'you', 'owe', 'us', 'xxxxx', 'in', 'back', 'tax', 'ye', 'of', 'cours', 'it', 'xxxx']
['im', 'sorri', 'i', 'dont', 'believ', 'you', 'can', 'you', 'provid', 'me', 'with', 'your', 'badg', 'number']


In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np



tokenizer = Tokenizer(num_words=10000)
# Now, fit the tokenizer on the flattened text data
tokenizer.fit_on_texts(flat_attacker_texts + flat_victim_texts)


attacker_sequences = tokenizer.texts_to_sequences(flat_attacker_texts)
victim_sequences = tokenizer.texts_to_sequences(flat_victim_texts)

print(attacker_sequences[0])
print(victim_sequences[0])

# Find the maximum length from both columns for padding
max_length_attacker = max([len(seq) for seq in attacker_sequences])
max_length_victim = max([len(seq) for seq in victim_sequences])
max_sequence_length = max(max_length_attacker, max_length_victim)  # Use the max length found for padding

# Pad sequences to have the same length
attacker_data = pad_sequences(attacker_sequences, maxlen=max_sequence_length)
victim_data = pad_sequences(victim_sequences, maxlen=max_sequence_length)



combined_data = np.concatenate([attacker_data, victim_data], axis=1)  # Concatenate along the sequence length

# Get the size of the vocabulary (adding 1 because Tokenizer is 1-indexed)
vocab_size = len(tokenizer.word_index) + 1

print("Shape of Combined Data:", combined_data.shape)

[339, 8, 9, 50, 30, 3, 62, 2, 170, 121, 303, 35, 94, 74]
[27, 81, 5, 55, 268, 2, 7, 2, 16, 23, 20, 1, 284, 21]
Shape of Combined Data: (3588, 766)


### Saving the tokenizer

In [13]:
import os
folder_path = '../Models/Tokenizers'

if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    
file_path = os.path.join(folder_path, 'tokenizer.json')

tokenizer_json = tokenizer.to_json()

with open(file_path, 'w', encoding='utf-8') as file:
    file.write(tokenizer_json) 

### Splitting the data into training and test sets.

In [10]:
from sklearn.model_selection import train_test_split

# Assuming combined_data is your feature data and labels are your target labels
X_train, X_test, y_train, y_test = train_test_split(combined_data, labels, test_size=0.2)

print("Training data shape", X_train.shape)
print("Training labels shape", y_train.shape)

print("Test data shape", X_test.shape)
print("Test labels shape", y_test.shape)

Training data shape (2870, 766)
Training labels shape (2870,)
Test data shape (718, 766)
Test labels shape (718,)


## HYPER PARAMETER TUNING

### Model Definition

In [11]:
from kerastuner import HyperModel
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.optimizers import Adam, RMSprop, SGD, Nadam
from src.CustomNNMetrics import F1Score


class NNHyperModel(HyperModel):
    def build(self, hp):
        model = Sequential()
        
        # Embedding Layer
        # Embedding Layer as the first layer
        model.add(Embedding(input_dim=hp.Int('vocab_size', min_value=1000, max_value=10000, step=1000),  # Vocabulary size
                            output_dim=hp.Int('embedding_output_dim', min_value=32, max_value=128, step=16),  # Output dimension of the embeddings
                            input_length=766)) 
        
        model.add(Flatten())
        
        # Layer 1
        model.add(Dense(units=hp.Int('units_first_layer', min_value=16, max_value=128, step=16), 
                        activation='relu', 
                        kernel_regularizer=l1_l2(l1=hp.Float('l1_first_layer', min_value=1e-7, max_value=1e-2, sampling='LOG'),
                                                 l2=hp.Float('l2_first_layer', min_value=1e-7, max_value=1e-2, sampling='LOG'))))
        
        model.add(Dropout(rate=hp.Float('dropout_first_layer', min_value=0.0, max_value=0.7, step=0.05)))
        
        # Layer 2
        model.add(Dense(units=hp.Int('units_second_layer', min_value=16, max_value=128, step=16), 
                        activation='relu', 
                        kernel_regularizer=l1_l2(l1=hp.Float('l1_second_layer', min_value=1e-7, max_value=1e-2, sampling='LOG'),
                                                 l2=hp.Float('l2_second_layer', min_value=1e-7, max_value=1e-2, sampling='LOG'))))

        model.add(Dropout(rate=hp.Float('dropout_second_layer', min_value=0.0, max_value=0.7, step=0.05)))
        
        # Layer 3
        model.add(Dense(units=hp.Int('units_third_layer', min_value=16, max_value=128, step=16), 
                        activation='relu', 
                        kernel_regularizer=l1_l2(l1=hp.Float('l1_third_layer', min_value=1e-7, max_value=1e-2, sampling='LOG'),
                                                 l2=hp.Float('l2_third_layer', min_value=1e-7, max_value=1e-2, sampling='LOG'))))
        
        model.add(Dropout(rate=hp.Float('dropout_third_layer', min_value=0.0, max_value=0.7, step=0.05)))
        
        # Output Layer
        model.add(Dense(1, activation='sigmoid'))
        
        optimizer_choice = hp.Choice('optimizer', ['adam', 'sgd', 'rmsprop', 'nadam'])
        
        if optimizer_choice == 'adam':
            optimizer = Adam(
                learning_rate=hp.Float('adam_learning_rate', 1e-5, 1e-2, sampling='LOG'),
                beta_1=hp.Float('adam_beta_1', 0.85, 0.95, sampling='LOG'),
                beta_2=hp.Float('adam_beta_2', 0.99, 0.999, sampling='LOG')
            )
        elif optimizer_choice == 'sgd':
            optimizer = SGD(
                learning_rate=hp.Float('sgd_learning_rate', 1e-5, 1e-2, sampling='LOG'),
                momentum=hp.Float('sgd_momentum', 0.1, 0.9, sampling='LOG')
            )
        elif optimizer_choice == 'rmsprop':
            optimizer = RMSprop(
                learning_rate=hp.Float('rmsprop_learning_rate', 1e-5, 1e-2, sampling='LOG'),
                rho=hp.Float('rmsprop_rho', 0.85, 0.95, sampling='LOG')
            )
        elif optimizer_choice == 'nadam':
            optimizer = Nadam(
                learning_rate=hp.Float('nadam_learning_rate', 1e-5, 1e-2, sampling='LOG'),
                beta_1=hp.Float('nadam_beta_1', 0.85, 0.95, sampling='LOG'),
                beta_2=hp.Float('nadam_beta_2', 0.99, 0.999, sampling='LOG')
            )
            
        model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), F1Score()])
        
        return model


  from kerastuner import HyperModel


### Tuner Configuration

In [12]:
from kerastuner.tuners import Hyperband
from kerastuner import Objective

hyper_model = NNHyperModel()

tuner = Hyperband(
    hyper_model,
    objective=Objective("val_loss", direction="min"),
    max_epochs=200,
    directory='../../Logs/Keras_Tuning',
    project_name='../../Logs/Keras_Tuning/Neural_Network_Tuning_EMBEDDING'
)

Reloading Tuner from ../../Logs/Keras_Tuning\../../Logs/Keras_Tuning/Neural_Network_Tuning_EMBEDDING\tuner0.json


### Searching for the Best Hyperparameters & Retrieving Best Model

In [13]:
tuner.search(X_train, y_train,
             epochs=200,
             validation_split=0.2)



Trial 201 Complete [00h 01m 06s]
val_loss: 0.4803275465965271

Best val_loss So Far: 0.3113371431827545
Total elapsed time: 02h 16m 33s

Search: Running Trial #202

Value             |Best Value So Far |Hyperparameter
4000              |4000              |vocab_size
112               |64                |embedding_output_dim
48                |32                |units_first_layer
4.0708e-07        |1.7424e-07        |l1_first_layer
1.6901e-05        |1.9465e-05        |l2_first_layer
0.15              |0.25              |dropout_first_layer
48                |112               |units_second_layer
5.6587e-07        |0.00037539        |l1_second_layer
4.8931e-05        |0.00051632        |l2_second_layer
0.05              |0.6               |dropout_second_layer
16                |48                |units_third_layer
5.3134e-05        |8.235e-06         |l1_third_layer
1.2334e-07        |2.578e-06         |l2_third_layer
0.55              |0.05              |dropout_third_layer
rmsprop   

KeyboardInterrupt: 

### Reviewing the tuning results

In [14]:
# Get the best trial
best_trial = tuner.oracle.get_best_trials(num_trials=1)[0]

# Print the best trial's details
print(f"Best Trial ID: {best_trial.trial_id}")
print(f"Best Trial Score: {best_trial.score}")

best_hyperparameters = best_trial.hyperparameters.values

print("Best Hyperparameters:")
for param, value in best_hyperparameters.items():
    print(f"\t{param}: {value}")

Best Trial ID: 0188
Best Trial Score: 0.3113371431827545
Best Hyperparameters:
	vocab_size: 4000
	embedding_output_dim: 64
	units_first_layer: 32
	l1_first_layer: 1.7423896638458018e-07
	l2_first_layer: 1.946493037309278e-05
	dropout_first_layer: 0.25
	units_second_layer: 112
	l1_second_layer: 0.0003753897006106875
	l2_second_layer: 0.000516323481041702
	dropout_second_layer: 0.6000000000000001
	units_third_layer: 48
	l1_third_layer: 8.234990826115495e-06
	l2_third_layer: 2.577977945979165e-06
	dropout_third_layer: 0.05
	optimizer: rmsprop
	adam_learning_rate: 0.0005475517926888837
	adam_beta_1: 0.8685251271554257
	adam_beta_2: 0.9918240365811867
	rmsprop_learning_rate: 0.00040970392330878926
	rmsprop_rho: 0.9437790600037682
	nadam_learning_rate: 0.0003691391455505471
	nadam_beta_1: 0.9221111842170197
	nadam_beta_2: 0.9971394872235539
	sgd_learning_rate: 0.009856125192373095
	sgd_momentum: 0.1986927371253983
	tuner/epochs: 67
	tuner/initial_epoch: 23
	tuner/bracket: 2
	tuner/round: 1
	

### Best Model

In [16]:
best_model = tuner.get_best_models(num_models=1)[0]
test_loss, test_accuracy, test_precision, test_recall, test_f1score = best_model.evaluate(X_test, y_test)

print(f"test_accuracy --> {test_accuracy}")
print(f"test_loss --> {test_loss}")
print(f"test_precision --> {test_precision}")
print(f"test_recall --> {test_recall}")
print(f"test_f1score --> {test_f1score}")

test_accuracy --> 0.9220055937767029
test_loss --> 0.28385913372039795
test_precision --> 0.9490835070610046
test_recall --> 0.9376257658004761
test_f1score --> 0.9433197975158691


### Saving the model

In [18]:
best_model.save("../Models/Trained_Models/NeuralNetwork_EMBEDDING/NeuralNetwork_EMBEDDING.keras")